about summary refs log tree commit diff
path: root/sys/gen.py
diff options
context:
space:
mode:
Diffstat (limited to 'sys/gen.py')
-rwxr-xr-x sys/gen.py 419
1 files changed, 419 insertions, 0 deletions
diff --git a/sys/gen.py b/sys/gen.py
new file mode 100755
index 0000000..e1e8c0f
--- /dev/null
+++ b/sys/gen.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python
+
+import re
+import gzip
+from math import log
+
+# supported operating systems/architectures
+# NOTE: operating_system is not referenced anywhere else in the visible part of this file.
+operating_system = [ 'linux' ]
+# musl arch directory name => our output directory name under linux/.
+# The 'generic' entry is skipped by the per-architecture loop in the main
+# section and is handled separately when errno.h is emitted.
+arch = {
+ 'generic' : 'port',
+ 'x86_64' : 'amd64',
+ 'i386' : 'i386',
+ 'arm' : 'arm',
+ 'aarch64' : 'arm64',
+ 'riscv64' : 'riscv64',
+}
+
+# all dictionaries take the form: musl name => our name
+
+# ------------------------------------------------------------------------
+# errno
+
+# name conversions
+# errprefix is prepended to every converted name by errwrap().
+errprefix = "sys·Error"
+# musl errno macro => suffix of our identifier.
+# errwrap() indexes this table directly, so a macro missing here raises KeyError.
+errno = {
+ "EPERM" : "NotPermit", "ENOENT" : "NoEntry",
+ "ESRCH" : "NoProc", "EINTR" : "Interrupt",
+ "EIO" : "IO", "ENXIO" : "NoDevAddr",
+ "E2BIG" : "ArgLen", "ENOEXEC" : "ExecFmt",
+ "EBADF" : "BadFd", "ECHILD" : "NoChild",
+ "EAGAIN" : "Again", "ENOMEM" : "NoMemory",
+ "EACCES" : "Access", "EFAULT" : "BadAddr",
+ "ENOTBLK" : "NotBlk", "EBUSY" : "Busy",
+ "EEXIST" : "Exists", "EXDEV" : "ExDevice",
+ "ENODEV" : "NoDevice", "ENOTDIR" : "NotDir",
+ "EISDIR" : "IsDir", "EINVAL" : "BadArg",
+ "ENFILE" : "NoFiles", "EMFILE" : "NoFd",
+ "ENOTTY" : "NotTTY", "ETXTBSY" : "TxtBusy",
+ "EFBIG" : "BigFile", "ENOSPC" : "NoSpace",
+ "ESPIPE" : "NoSeek", "EROFS" : "ReadOnly",
+ "EMLINK" : "NoLink", "EPIPE" : "BadPipe",
+ "EDOM" : "BadDomain", "ERANGE" : "BadResult",
+ "EDEADLK" : "Deadlock", "ENAMETOOLONG" : "NameLen",
+ "ENOLCK" : "NoLock", "ENOSYS" : "NoSyscall",
+ "ENOTEMPTY" : "NotEmptyDir", "ELOOP" : "LinkLoop",
+ "EWOULDBLOCK" : "Blocks", "ENOMSG" : "NoMsg",
+ "EIDRM" : "NoIdent", "ECHRNG" : "BadChannel",
+ "EL2NSYNC" : "Lvl2NoSync", "EL3HLT" : "Lvl3Halt",
+ "EL3RST" : "Lvl3Reset", "ELNRNG" : "LongChannel",
+ "EUNATCH" : "NoDriver", "ENOCSI" : "NoCSI",
+ "EL2HLT" : "Lvl2Halt", "EBADE" : "BadExchange",
+ "EBADR" : "BadRequest", "EXFULL" : "FullExchange",
+ "ENOANO" : "NoAnode", "EBADRQC" : "BadRequestCode",
+ "EBADSLT" : "BadSlot", "EDEADLOCK" : "Deadlock2",
+ "EBFONT" : "BadFont", "ENOSTR" : "NoStr",
+ "ENODATA" : "NoData", "ETIME" : "Timeout",
+ "ENOSR" : "NoStream", "ENONET" : "NoNet",
+ "ENOPKG" : "NoPkg", "EREMOTE" : "IsRemote",
+ "ENOLINK" : "DeadLink", "EADV" : "Adv",
+ "ESRMNT" : "Srmnt", "ECOMM" : "Communicate",
+ "EPROTO" : "Protocol", "EMULTIHOP" : "Multihop",
+ "EDOTDOT" : "DotDot", "EBADMSG" : "BadMsg",
+ "EOVERFLOW" : "Overflow", "ENOTUNIQ" : "NotUnique",
+ "EBADFD" : "BadFdState", "EREMCHG" : "NewAddress",
+ "ELIBACC" : "ShlibAccess", "ELIBBAD" : "BadShlib",
+ "ELIBSCN" : "BadLibCode", "ELIBMAX" : "ShlibLen",
+ "ELIBEXEC" : "SlibExec", "EILSEQ" : "BadWideChar",
+ "ERESTART" : "Restart", "ESTRPIPE" : "StreamPipe",
+ "EUSERS" : "UserLen", "ENOTSOCK" : "NotSocket",
+ "EDESTADDRREQ" : "NeedDstAddr", "EMSGSIZE" : "MsgLen",
+ "EPROTOTYPE" : "BadProtocol", "ENOPROTOOPT" : "NoProtocol",
+ "EPROTONOSUPPORT" : "NoProtocolSupport","ESOCKTNOSUPPORT" : "NoSocketSupport",
+ "EOPNOTSUPP" : "ProtocolSupport", "ENOTSUP" : "NoSupport",
+ "EPFNOSUPPORT" : "NoPFSupport", "EAFNOSUPPORT" : "NoAFSupport",
+ "EADDRINUSE" : "AddrInUse", "EADDRNOTAVAIL" : "AddrNotAvail",
+ "ENETDOWN" : "NetDown", "ENETUNREACH" : "NetHang",
+ "ENETRESET" : "NetReset", "ECONNABORTED" : "AbortConnect",
+ "ECONNRESET" : "ResetConnect", "ENOBUFS" : "NoBuffer",
+ "EISCONN" : "IsConnect", "ENOTCONN" : "NotConnect",
+ "ESHUTDOWN" : "Shutdown", "ETOOMANYREFS" : "RefsLen",
+ "ETIMEDOUT" : "TimedOut", "ECONNREFUSED" : "RefuseConnect",
+ "EHOSTDOWN" : "HostDown", "EHOSTUNREACH" : "HostHang",
+ "EALREADY" : "AlreadyConnect", "EINPROGRESS" : "InProgress",
+ "ESTALE" : "Stale", "EUCLEAN" : "NeedClean",
+ "ENOTNAM" : "NotNamed", "ENAVAIL" : "NotAvail",
+ "EISNAM" : "IsNamed", "EREMOTEIO" : "RemoteIO",
+ "EDQUOT" : "DiskQuota", "ENOMEDIUM" : "NoMedium",
+ "EMEDIUMTYPE" : "BadMedium", "ECANCELED" : "Cancel",
+ "ENOKEY" : "NoKey", "EKEYEXPIRED" : "ExpireKey",
+ "EKEYREVOKED" : "RevokeKey", "EKEYREJECTED" : "RejectKey",
+ "EOWNERDEAD" : "OwnerDie", "ENOTRECOVERABLE" : "NoRecover",
+ "ERFKILL" : "RFKill", "EHWPOISON" : "HWPoison",
+}
+
+def errwrap(name):
+    """Translate a musl errno macro name into our prefixed identifier.
+
+    Raises KeyError when `name` has no entry in the errno table.
+    """
+    suffix = errno[name]
+    return f"{errprefix}{suffix}"
+
+def fmterrno(writer, musl):
+    """Emit our errno #defines from musl's bits/errno.h.
+
+    writer: text stream that receives the generated defines.
+    musl:   directory that contains bits/errno.h.
+
+    A numeric value is copied through unchanged; any other value is taken
+    to be the name of another errno macro and is converted with errwrap().
+
+    Raises KeyError when a macro is missing from the errno table.
+    """
+    with open(f"{musl}/bits/errno.h", "r") as reader:
+        for line in reader:
+            word = line.split()
+            # skip blank lines and anything that is not "#define NAME VALUE"
+            if len(word) < 3 or word[0] != "#define":
+                continue
+
+            name, value = word[1], word[2]
+            try:
+                ours = errwrap(name)
+                if not value.isdigit():
+                    value = errwrap(value)
+            except KeyError as err:
+                # only unknown names are reported this way; I/O errors raised
+                # while writing are no longer relabelled as KeyError
+                raise KeyError(f"{name} not recognized") from err
+
+            putdefine(writer, ours, value, left=30, right=30)
+
+def fmtstrerror(writer, musl):
+    """Emit the E(code, "message") table from musl's __strerror.h.
+
+    writer: text stream that receives the generated entries.
+    musl:   directory that contains __strerror.h.
+
+    Every `E(NAME, "text")` line is rewritten with NAME converted by
+    errwrap() (the literal code 0 is kept as is) and the text lower-cased.
+    All other lines are skipped.
+
+    Raises KeyError (from errwrap) when NAME is not in the errno table.
+    """
+    with open(f"{musl}/__strerror.h", "r") as reader:
+        for line in reader:
+            # strip the line ending first so that a final line without a
+            # trailing newline is not cut one character short
+            entry = line.strip()
+            if not (entry.startswith("E(") and entry.endswith(")")):
+                continue
+
+            # split on the first comma only: the message may contain commas
+            code, _, text = entry[2:-1].partition(",")
+            code = code.strip()
+            name = code if code == '0' else errwrap(code)
+
+            head = f"E({name},"
+            mesg = text.strip().lower() + ')'
+            print(f"{head:<35}{mesg}", file=writer)
+
+# ------------------------------------------------------------------------
+# syscalls
+
+# prefix prepended to every identifier returned by syscall()
+syscallprefix = "·"
+
+class LanguageModel(object):
+    """Word-cost model used to split run-together identifiers into words.
+
+    Each known word has a cost; a string is split into the sequence of
+    words whose total cost is minimal (see split()).
+    """
+
+    def __init__(self, word_file):
+        """Load a gzip-compressed, whitespace-separated word list.
+
+        The cost of a word grows with its position in the list, assuming
+        Zipf's law and cost = -math.log(probability), so earlier words are
+        preferred by the splitter.
+
+        Raises ValueError when the word list is empty.
+        """
+        self.word_file = word_file
+
+        with gzip.open(word_file) as f:
+            words = f.read().decode().split()
+        if not words:
+            raise ValueError(f"{word_file}: word list is empty")
+
+        scale = log(len(words))
+        self._wordcost = {k: log((i+1)*scale) for i, k in enumerate(words)}
+        self._maxword = max(len(x) for x in words)
+
+    def add_word(self, w):
+        """Register `w` with zero cost so the splitter always prefers it."""
+        self._wordcost[w] = 0
+        self._maxword = max(self._maxword, len(w))
+
+    def del_word(self, w):
+        """Make a known word `w` prohibitively expensive; unknown words are ignored."""
+        if w in self._wordcost:
+            self._wordcost[w] = 1000000000
+
+    def split(self, s):
+        """Uses dynamic programming to infer the location of spaces in a string without spaces.
+
+        `s` is first cut at every run of characters matched by the
+        module-level REGEXP; each piece is then split into words.
+        Returns the words as a flat list.
+        """
+        l = [self._split(x) for x in REGEXP.split(s)]
+        return [item for sublist in l for item in sublist]
+
+    def _split(self, s):
+        """Split one separator-free chunk; returns an iterator over its words."""
+        unknown = float("inf")  # cost of a substring that is not a known word
+
+        # Find the best match for the i first characters, assuming cost has
+        # been built for the i-1 first characters.
+        # Returns a pair (match_cost, match_length).
+        def best_match(i):
+            candidates = enumerate(reversed(cost[max(0, i-self._maxword):i]))
+            return min((c + self._wordcost.get(s[i-k-1:i].lower(), unknown), k+1) for k,c in candidates)
+
+        # Build the cost array.
+        cost = [0]
+        for i in range(1, len(s)+1):
+            c, _ = best_match(i)
+            cost.append(c)
+
+        # Backtrack to recover the minimal-cost string.
+        out = []
+        i = len(s)
+        while i > 0:
+            c, k = best_match(i)
+            assert c == cost[i]
+            token = s[i-k:i]
+
+            # Apostrophe and digit handling (added by Genesys)
+            new_token = True
+            if token != "'" and out:
+                # a lone apostrophe is never merged; otherwise re-attach a
+                # split 's and keep runs of digits together
+                if out[-1] == "'s" or (s[i-1].isdigit() and out[-1][0].isdigit()):
+                    out[-1] = token + out[-1]  # combine current token with previous token
+                    new_token = False
+            # (End of Genesys addition)
+
+            if new_token:
+                out.append(token)
+
+            i -= k
+
+        # tokens were collected from the end of the string backwards
+        return reversed(out)
+
+# model shared by splitwords(); the word list is read as soon as this line runs
+DEFAULT = LanguageModel("../data/words.txt.gz")
+# LanguageModel.split() cuts its input at every run of characters that are
+# not letters, digits or apostrophes
+REGEXP = re.compile("[^a-zA-Z0-9']+")
+
+def splitwords(s):
+    """Split `s` into words with the shared DEFAULT language model."""
+    model = DEFAULT
+    return model.split(s)
+
+# make these words prohibitively expensive so the splitter avoids them
+for w in ["pi", "sid"]:
+ DEFAULT.del_word(w)
+
+# give these words zero cost so the splitter prefers them
+for w in [
+ "clock", "idle", "uring", "shm", "sem", "set", "ctl", "cntl",
+ "remap", "unmap", "sched", "fd", "io", "pid",
+ "recv", "chmod", "chown", "lock", "stat", "sys",
+ "async", "brk", "id", "unlock", "cwd", "syms",
+ "attr", "chroot", "chdir", "mkdir", "rmdir",
+ "gid", "uid", "serv", "acct", "unlink", "futex"]:
+ DEFAULT.add_word(w)
+
+# substring rewrites applied by syscall() before word splitting: musl spelling => ours
+corpus = {
+ "msgsnd" : "msgsend", "msgrcv" : "msgrecv",
+ "creat" : "create", "mincore" : "mloaded",
+}
+
+def syscall(musl, prefix="__NR_"):
+    """Convert a musl syscall macro name into our identifier.
+
+    `prefix` is removed from the front of `musl`, the corpus rewrites are
+    applied, and the result is split into words that are title-cased,
+    joined and prefixed with syscallprefix.
+    """
+    name = musl.removeprefix(prefix)
+
+    for old, new in corpus.items():
+        # leave the name alone when it already contains the replacement
+        if new in name:
+            continue
+        name = name.replace(old, new)
+
+    titled = [part.title() for part in splitwords(name)]
+    return syscallprefix + "".join(titled)
+
+def fmtsysnum(reader, writer):
+    """Emit one #define per numeric syscall number read from `reader`.
+
+    Only lines of exactly three fields that start with "#define" are
+    considered; the macro name is converted with syscall() and the define
+    is written to `writer` when its value consists of digits only.
+    """
+    for line in reader:
+        fields = line.split()
+        if len(fields) != 3:
+            continue
+        directive, call, sysn = fields
+        if directive != "#define":
+            continue
+
+        head = syscall(call)
+        if not sysn.isdigit():
+            continue
+        print(f"#define {head:<30} {sysn:>30}", file=writer)
+
+def fmtsysasm(reader, writer):
+    """Copy the syscall assembly routines from `reader` to `writer`.
+
+    A short header is written first. Lines that start with one of the
+    skipped prefixes, or whose stripped text starts with "*", are dropped;
+    every other line is rewritten with the textual substitutions below,
+    applied in order.
+    """
+    skipped = (
+        "#define __SYSCALL_LL",
+        "((union",
+        "#define SYSCALL_IPC",
+        "#define SYSCALL_FADVISE",
+        "#define VDSO",
+        "#define IPC",
+        "/*",
+    )
+    rewrites = (
+        ("__syscall", "_syscall"),
+        ("__inline", "inline"),
+        ("__asm__ __volatile__ ", "asm"),
+        ("\t", " "),
+        ("unsigned long", "ulong"),
+        ("static inline ", "static inline\n"),
+    )
+
+    putline(writer)
+    putcomment(writer, "assembly routines")
+    putdefine(writer, "asm", "__asm__ __volatile__", left=5)
+
+    for line in reader:
+        if line.startswith(skipped) or line.strip().startswith("*"):
+            continue
+        for old, new in rewrites:
+            line = line.replace(old, new)
+        print(line, file=writer, end="")
+
+def fmtsyscall(writer, musl):
+    """Write the syscall numbers, then the assembly routines, for one architecture.
+
+    `musl` is the directory that holds bits/syscall.h.in and syscall_arch.h.
+    """
+    sources = (
+        (f"{musl}/bits/syscall.h.in", fmtsysnum),
+        (f"{musl}/syscall_arch.h", fmtsysasm),
+    )
+    for path, emit in sources:
+        with open(path) as reader:
+            emit(reader, writer)
+
+def fmtsyscallold(writer, musl):
+    """Rewrite the SYS_* preprocessor lines of the file `musl` with our names.
+
+    Handles "#define", "#undef", "#ifdef" and "#ifndef" lines whose macro
+    starts with SYS_, plus the "#endif" that follows an emitted
+    #ifdef/#ifndef. Every word after the directive is converted with
+    syscall(prefix="SYS_"). All other lines are ignored.
+    """
+    directives = ("#define SYS_", "#undef SYS_", "#ifdef SYS_", "#ifndef SYS_")
+
+    with open(musl, "r") as reader:
+        pending = False  # True while an emitted #ifdef/#ifndef awaits its #endif
+        for line in reader:
+            closes = pending and line.startswith("#endif")
+            if not (line.startswith(directives) or closes):
+                continue
+
+            head, *rest = line.split()
+            names = [syscall(w, prefix="SYS_") for w in rest]
+
+            if head == "#endif":
+                print(head, file=writer)
+                putline(writer)
+                pending = False
+            elif head in ("#ifdef", "#ifndef"):
+                pending = True
+                print(head, names[0], file=writer)
+            elif head == "#undef":
+                print(f"# {head[1:]:<8}{names[0]}", file=writer)
+            else:
+                print(f"# {head[1:]:<8}{names[0]:<10} {names[1]}", file=writer)
+
+# ------------------------------------------------------------------------
+# types
+
+# macro names that fmtstdint() looks for (as substrings) in each line it reads
+stdintdefs=[
+ "INTPTR_MIN", "INTPTR_MAX", "UINTPTR_MIN", "UINTPTR_MAX",
+ "PTRDIFF_MIN", "PTRDIFF_MAX", "SIZE_MAX", "LONG_MAX",
+]
+
+# C types emitted by fmtbits() for the INT8/INT16/INT32/INT64/ADDR defines;
+# the per-architecture tables are keyed by musl arch directory name
+int8 = "char"
+int16 = "short"
+int32 = "int"
+int64 = {'x86_64':"long",'i386':"long long",'arm':"long long",'aarch64':"long",'riscv64':"long"}
+addr = {'x86_64':"long",'i386':"int",'arm':"int",'aarch64':"long",'riscv64':"long"}
+
+# value emitted by fmtbits() for LONG_MAX, per musl arch directory name
+long64="0x7fffffffffffffffL"
+long32="0x7fffffffL"
+maxlong={'x86_64':long64, 'i386':long32, 'arm':long32, 'aarch64': long64, 'riscv64': long64}
+
+def fmtstdint(reader, writer):
+    """Copy the #define lines of `reader` that mention a name in stdintdefs.
+
+    Each matching "#define NAME VALUE" line is written to `writer` exactly
+    once through putdefine(); all other lines are skipped.
+    """
+    for line in reader:
+        # test the line once against all names: "UINTPTR_MAX" also contains
+        # "INTPTR_MAX", so emitting per matching name writes that define twice
+        if not any(intdef in line for intdef in stdintdefs):
+            continue
+        word = line.split()
+        if len(word) < 3 or word[0] != "#define":
+            continue
+        putdefine(writer, word[1], word[2])
+
+def fmtbits(writer, musl, arch):
+    """Write the bits.h header for one architecture.
+
+    writer: text stream that receives the header.
+    musl:   musl directory of the architecture (holds bits/limits.h and
+            bits/stdint.h).
+    arch:   musl architecture name; key into the int64, addr and maxlong
+            tables.
+
+    A missing bits/limits.h is reported on stderr and skipped.
+    Raises KeyError when `arch` is missing from one of the tables.
+    """
+    putheader(writer)
+    putline(writer)
+    putcomment(writer, "base types")
+
+    putdefine(writer, "INT8", int8)
+    putdefine(writer, "INT16", int16)
+    putdefine(writer, "INT32", int32)
+    putdefine(writer, "INT64", int64[arch])
+    putdefine(writer, "ADDR", addr[arch])
+
+    putline(writer)
+    putcomment(writer, "abi")
+
+    limits = f"{musl}/bits/limits.h"
+    if os.path.exists(limits):
+        with open(limits) as reader:
+            for line in reader:
+                word = line.split()
+                # copy only "#define NAME VALUE" lines; blank lines and
+                # #if/#endif lines would otherwise crash or produce bogus defines
+                if len(word) < 3 or word[0] != "#define":
+                    continue
+                putdefine(writer, word[1], word[2])
+    else:
+        print(f"----> not found {limits}", file=stderr)
+
+    with open(f"{musl}/bits/stdint.h") as io:
+        fmtstdint(io, writer)
+
+    putdefine(writer, "LONG_MAX", maxlong[arch])
+
+# ------------------------------------------------------------------------
+# main point of entry
+
+import os
+
+from sys import stdout, stderr
+from os import makedirs as mkpath
+
+def putheader(writer):
+    """Write the "#pragma once" line that opens a generated header."""
+    pragma = "#pragma once"
+    print(pragma, file=writer)
+
+def putline(writer):
+    """Write one empty line to `writer`."""
+    print(file=writer)
+
+def putcomment(writer, string):
+    """Write `string` to `writer` as a one-line C block comment."""
+    comment = "/* {} */".format(string)
+    print(comment, file=writer)
+
+def putdefine(writer, name, value, left=20, right=0):
+    """Write "#define name value" to `writer`.
+
+    `name` is left-aligned in a field of `left` characters and `value` in
+    a field of `right` characters.
+    """
+    text = "#define {n:<{l}} {v:<{r}}".format(n=name, l=left, v=value, r=right)
+    print(text, file=writer)
+
+# root of the vendored musl sources, relative to the working directory
+muslroot = "../vendor/musl"
+if __name__ == "__main__":
+    # architecture specifics
+    for source,target in arch.items():
+        # the generic tree only provides errno.h, emitted after this loop
+        if source == "generic":
+            continue
+
+        print(f"> for {target}",file=stderr)
+
+        musl = f"{muslroot}/arch/{source}"
+
+        libn = f"linux/{target}"
+        libnarch = f"{libn}/arch"
+        mkpath(libnarch, exist_ok=True)
+
+        print("--> emitting types", file=stderr)
+        with open(f"{libnarch}/bits.h", "w") as io:
+            fmtbits(io, musl, source)
+
+        print("--> emitting syscalls", file=stderr)
+        with open(f"{libn}/syscall.h", "w") as io:
+            fmtsyscall(io,musl)
+
+    # TODO: generalize away from linux
+    # portable operating system
+    print("--> emitting errno", file=stderr)
+    source, target = "generic", "port"
+
+    musl = f"{muslroot}/arch/{source}"
+    libn = f"linux/{target}/arch"
+    # create the directory errno.h is written to; this used to re-create the
+    # last architecture's directory, leaving linux/port/arch possibly missing
+    mkpath(libn, exist_ok=True)
+    with open(f"{libn}/errno.h","w") as io:
+        fmterrno(io, musl)
+
+    musl = f"{muslroot}/src/errno"
+    libn = "src/errno.inc.h"
+    with open(libn,"w") as io:
+        fmtstrerror(io, musl)
+
+    libn = "src/internal.h"
+    musl = f"{muslroot}/src/internal/syscall.h"
+
+    # the legacy syscall fixup is appended only while the file still ends
+    # with the three-line marker, so a second run does not append it again
+    marker = (
+        "/*********************************************",
+        "* XXX: below this line is computer generated",
+        "*********************************************/",
+    )
+    with open(libn,"r") as file:
+        tail = file.readlines()[-3:]
+    # a file shorter than three lines cannot end with the marker
+    needfix = len(tail) == 3 and all(m in t for m, t in zip(marker, tail))
+
+    if needfix:
+        print("--> appending legacy syscall fixup", file=stderr)
+        with open(libn,"a") as io:
+            fmtsyscallold(io, musl)