#!/usr/bin/env python
"""Generate platform specific C headers from a vendored musl source tree.

Provenance: reconstructed from the patch that created ``sys/gen.py``
(commit 7eb20457ff3d80cc81d54204ffd2c9edaf991905, Nicholas Noll,
2021-11-17, "automated platform specific code parsing").

NOTE(review): the patch this file was recovered from had its whitespace
collapsed.  Indentation was re-derived from syntax, but runs of spaces that
lived *inside* string literals (e.g. the tab replacement in ``fmtsysasm`` or
the ``"# {...}"`` format strings in ``fmtsyscallold``) may originally have
been wider -- TODO confirm against upstream.

All paths are relative to the current working directory; the script reads
from ``../vendor/musl`` and ``../data/words.txt.gz`` and writes below
``linux/`` and ``src/``.
"""

import gzip
import os
import re
from math import log
from os import makedirs as mkpath
from sys import stdout, stderr

# supported operating systems/architectures
operating_system = ['linux']
arch = {
    'generic': 'port',
    'x86_64': 'amd64',
    'i386': 'i386',
    'arm': 'arm',
    'aarch64': 'arm64',
    'riscv64': 'riscv64',
}

# all dictionaries take form: musl => us

# ------------------------------------------------------------------------
# output helpers


def putheader(writer):
    """Write the include guard pragma to *writer*."""
    print("#pragma once", file=writer)


def putline(writer):
    """Write an empty line to *writer*."""
    print("", file=writer)


def putcomment(writer, string):
    """Write *string* to *writer* as a one-line C block comment."""
    print(f"/* {string} */", file=writer)


def putdefine(writer, name, value, left=20, right=0):
    """Write ``#define name value`` to *writer*.

    *name* is left-justified to *left* columns and *value* to *right* columns.
    """
    print(f"#define {name:<{left}} {value:<{right}}", file=writer)


# ------------------------------------------------------------------------
# errno

# name conversions
errprefix = "sys·Error"
errno = {
    "EPERM": "NotPermit", "ENOENT": "NoEntry",
    "ESRCH": "NoProc", "EINTR": "Interrupt",
    "EIO": "IO", "ENXIO": "NoDevAddr",
    "E2BIG": "ArgLen", "ENOEXEC": "ExecFmt",
    "EBADF": "BadFd", "ECHILD": "NoChild",
    "EAGAIN": "Again", "ENOMEM": "NoMemory",
    "EACCES": "Access", "EFAULT": "BadAddr",
    "ENOTBLK": "NotBlk", "EBUSY": "Busy",
    "EEXIST": "Exists", "EXDEV": "ExDevice",
    "ENODEV": "NoDevice", "ENOTDIR": "NotDir",
    "EISDIR": "IsDir", "EINVAL": "BadArg",
    "ENFILE": "NoFiles", "EMFILE": "NoFd",
    "ENOTTY": "NotTTY", "ETXTBSY": "TxtBusy",
    "EFBIG": "BigFile", "ENOSPC": "NoSpace",
    "ESPIPE": "NoSeek", "EROFS": "ReadOnly",
    "EMLINK": "NoLink", "EPIPE": "BadPipe",
    "EDOM": "BadDomain", "ERANGE": "BadResult",
    "EDEADLK": "Deadlock", "ENAMETOOLONG": "NameLen",
    "ENOLCK": "NoLock", "ENOSYS": "NoSyscall",
    "ENOTEMPTY": "NotEmptyDir", "ELOOP": "LinkLoop",
    "EWOULDBLOCK": "Blocks", "ENOMSG": "NoMsg",
    "EIDRM": "NoIdent", "ECHRNG": "BadChannel",
    "EL2NSYNC": "Lvl2NoSync", "EL3HLT": "Lvl3Halt",
    "EL3RST": "Lvl3Reset", "ELNRNG": "LongChannel",
    "EUNATCH": "NoDriver", "ENOCSI": "NoCSI",
    "EL2HLT": "Lvl2Halt", "EBADE": "BadExchange",
    "EBADR": "BadRequest", "EXFULL": "FullExchange",
    "ENOANO": "NoAnode", "EBADRQC": "BadRequestCode",
    "EBADSLT": "BadSlot", "EDEADLOCK": "Deadlock2",
    "EBFONT": "BadFont", "ENOSTR": "NoStr",
    "ENODATA": "NoData", "ETIME": "Timeout",
    "ENOSR": "NoStream", "ENONET": "NoNet",
    "ENOPKG": "NoPkg", "EREMOTE": "IsRemote",
    "ENOLINK": "DeadLink", "EADV": "Adv",
    "ESRMNT": "Srmnt", "ECOMM": "Communicate",
    "EPROTO": "Protocol", "EMULTIHOP": "Multihop",
    "EDOTDOT": "DotDot", "EBADMSG": "BadMsg",
    "EOVERFLOW": "Overflow", "ENOTUNIQ": "NotUnique",
    "EBADFD": "BadFdState", "EREMCHG": "NewAddress",
    "ELIBACC": "ShlibAccess", "ELIBBAD": "BadShlib",
    "ELIBSCN": "BadLibCode", "ELIBMAX": "ShlibLen",
    "ELIBEXEC": "SlibExec", "EILSEQ": "BadWideChar",
    "ERESTART": "Restart", "ESTRPIPE": "StreamPipe",
    "EUSERS": "UserLen", "ENOTSOCK": "NotSocket",
    "EDESTADDRREQ": "NeedDstAddr", "EMSGSIZE": "MsgLen",
    "EPROTOTYPE": "BadProtocol", "ENOPROTOOPT": "NoProtocol",
    "EPROTONOSUPPORT": "NoProtocolSupport", "ESOCKTNOSUPPORT": "NoSocketSupport",
    "EOPNOTSUPP": "ProtocolSupport", "ENOTSUP": "NoSupport",
    "EPFNOSUPPORT": "NoPFSupport", "EAFNOSUPPORT": "NoAFSupport",
    "EADDRINUSE": "AddrInUse", "EADDRNOTAVAIL": "AddrNotAvail",
    "ENETDOWN": "NetDown", "ENETUNREACH": "NetHang",
    "ENETRESET": "NetReset", "ECONNABORTED": "AbortConnect",
    "ECONNRESET": "ResetConnect", "ENOBUFS": "NoBuffer",
    "EISCONN": "IsConnect", "ENOTCONN": "NotConnect",
    "ESHUTDOWN": "Shutdown", "ETOOMANYREFS": "RefsLen",
    "ETIMEDOUT": "TimedOut", "ECONNREFUSED": "RefuseConnect",
    "EHOSTDOWN": "HostDown", "EHOSTUNREACH": "HostHang",
    "EALREADY": "AlreadyConnect", "EINPROGRESS": "InProgress",
    "ESTALE": "Stale", "EUCLEAN": "NeedClean",
    "ENOTNAM": "NotNamed", "ENAVAIL": "NotAvail",
    "EISNAM": "IsNamed", "EREMOTEIO": "RemoteIO",
    "EDQUOT": "DiskQuota", "ENOMEDIUM": "NoMedium",
    "EMEDIUMTYPE": "BadMedium", "ECANCELED": "Cancel",
    "ENOKEY": "NoKey", "EKEYEXPIRED": "ExpireKey",
    "EKEYREVOKED": "RevokeKey", "EKEYREJECTED": "RejectKey",
    "EOWNERDEAD": "OwnerDie", "ENOTRECOVERABLE": "NoRecover",
    "ERFKILL": "RFKill", "EHWPOISON": "HWPoison",
}


def errwrap(name):
    """Return our prefixed identifier for the musl errno *name*.

    Raises ``KeyError`` when *name* has no entry in ``errno``.
    """
    return errprefix + errno[name]


def fmterrno(writer, musl):
    """Translate ``{musl}/bits/errno.h`` into our errno defines on *writer*.

    Numeric values are copied through; a value that is itself an errno name
    (an alias) is translated with ``errwrap``.  Lines that are blank or do
    not start with ``#define`` are skipped.

    Raises ``KeyError`` when a define is malformed or names an unknown errno.
    """
    with open(f"{musl}/bits/errno.h", "r") as reader:
        for line in reader:
            word = line.split()
            if not word or word[0] != "#define":
                continue
            # Only the lookups live in the try so that write errors from
            # putdefine are not misreported as an unknown errno.
            try:
                name = errwrap(word[1])
                value = word[2] if word[2].isdigit() else errwrap(word[2])
            except (KeyError, IndexError) as err:
                culprit = word[1] if len(word) > 1 else line.strip()
                raise KeyError(f"{culprit} not recognized") from err
            putdefine(writer, name, value, left=30, right=30)


def fmtstrerror(writer, musl):
    """Translate ``{musl}/__strerror.h`` into our ``E(name, "message")`` table.

    Only lines starting with ``E(`` are considered.  The code ``0`` is kept
    verbatim, every other code goes through ``errwrap``; the message part is
    lower-cased.
    """
    with open(f"{musl}/__strerror.h", "r") as reader:
        for line in reader:
            if not line.startswith("E("):
                continue
            # Peel off the "E(" ... ")" wrapper without assuming that the
            # line ends in exactly ")\n" (the last line may lack a newline).
            body = line.rstrip()
            if body.endswith(")"):
                body = body[:-1]
            # Split on the first comma only so commas inside the message
            # text survive.
            code, _, text = body[2:].partition(",")
            code = code.strip()
            name = "E(" + (code if code == '0' else errwrap(code)) + ","
            mesg = text.strip().lower() + ')'
            print(f"{name:<35}{mesg}", file=writer)


# ------------------------------------------------------------------------
# syscalls

syscallprefix = "·"

# Separators between runs that are split independently by LanguageModel.split.
REGEXP = re.compile(r"[^a-zA-Z0-9']+")


class LanguageModel(object):
    """Infer word boundaries in a run-together string from a ranked word list.

    The word list is a gzip-compressed, whitespace separated file whose
    position determines each word's cost (earlier words are cheaper).
    """

    def __init__(self, word_file):
        """Load the ranked word list from the gzip file *word_file*."""
        self.word_file = word_file

        # Build a cost dictionary, assuming Zipf's law and
        # cost = -math.log(probability).
        with gzip.open(word_file) as f:
            words = f.read().decode().split()
        scale = log(len(words))
        self._wordcost = {k: log((i + 1) * scale) for i, k in enumerate(words)}
        self._maxword = max(len(x) for x in words)

    def add_word(self, w):
        """Register *w* with cost 0 so that it is always preferred."""
        self._wordcost[w] = 0
        self._maxword = max(self._maxword, len(w))

    def del_word(self, w):
        """Make a known word *w* prohibitively expensive; unknown words are ignored."""
        if w in self._wordcost:
            self._wordcost[w] = 1000000000

    def split(self, s):
        """Uses dynamic programming to infer the location of spaces in a string without spaces."""
        pieces = [self._split(x) for x in REGEXP.split(s)]
        return [item for sublist in pieces for item in sublist]

    def _split(self, s):
        """Split one separator-free run *s*; returns an iterator over the words."""
        unknown = float("inf")  # cost of a substring that is not a known word

        # Find the best match for the i first characters, assuming cost has
        # been built for the i-1 first characters.
        # Returns a pair (match_cost, match_length).
        def best_match(i):
            candidates = enumerate(reversed(cost[max(0, i - self._maxword):i]))
            return min(
                (c + self._wordcost.get(s[i - k - 1:i].lower(), unknown), k + 1)
                for k, c in candidates
            )

        # Build the cost array.
        cost = [0]
        for i in range(1, len(s) + 1):
            c, k = best_match(i)
            cost.append(c)

        # Backtrack to recover the minimal-cost string.
        out = []
        i = len(s)
        while i > 0:
            c, k = best_match(i)
            assert c == cost[i]
            # Apostrophe and digit handling (added by Genesys)
            newToken = True
            if not s[i - k:i] == "'":  # ignore a lone apostrophe
                if len(out) > 0:
                    # re-attach split 's and split digits
                    if out[-1] == "'s" or (s[i - 1].isdigit() and out[-1][0].isdigit()):
                        # combine current token with previous token
                        out[-1] = s[i - k:i] + out[-1]
                        newToken = False
            # (End of Genesys addition)

            if newToken:
                out.append(s[i - k:i])

            i -= k

        return reversed(out)


# Location and tuning of the default model.  The model itself is created on
# first use so that importing this module performs no file I/O.
_WORD_FILE = "../data/words.txt.gz"
_DROPPED_WORDS = ["pi", "sid"]
_EXTRA_WORDS = [
    "clock", "idle", "uring", "shm", "sem", "set", "ctl", "cntl",
    "remap", "unmap", "sched", "fd", "io", "pid",
    "recv", "chmod", "chown", "lock", "stat", "sys",
    "async", "brk", "id", "unlock", "cwd", "syms",
    "attr", "chroot", "chdir", "mkdir", "rmdir",
    "gid", "uid", "serv", "acct", "unlink", "futex"]
_default = None


def _default_model():
    """Return the shared LanguageModel, loading and tuning it on first call."""
    global _default
    if _default is None:
        model = LanguageModel(_WORD_FILE)
        for w in _DROPPED_WORDS:
            model.del_word(w)
        for w in _EXTRA_WORDS:
            model.add_word(w)
        _default = model
    return _default


def __getattr__(name):
    """Serve the module attribute ``DEFAULT`` lazily (PEP 562)."""
    if name == "DEFAULT":
        return _default_model()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def splitwords(s):
    """Split *s* into words using the default language model."""
    return _default_model().split(s)


corpus = {
    "msgsnd": "msgsend", "msgrcv": "msgrecv",
    "creat": "create", "mincore": "mloaded",
}


def syscall(musl, prefix="__NR_"):
    """Convert a musl syscall macro name into our prefixed TitleCase name."""
    ours = musl.removeprefix(prefix)

    for source, target in corpus.items():
        # skip names that already contain the replacement ("create" vs "creat")
        if target not in ours:
            ours = ours.replace(source, target)

    return syscallprefix + "".join(word.title() for word in splitwords(ours))


def fmtsysnum(reader, writer):
    """Write a define for every ``#define NAME <digits>`` line of *reader*."""
    for line in reader:
        word = line.split()
        if len(word) != 3 or word[0] != "#define":
            continue

        call, sysn = word[1], word[2]
        if sysn.isdigit():
            head = syscall(call)
            print(f"#define {head:<30} {sysn:>30}", file=writer)


# Lines of syscall_arch.h that are dropped when they start with one of these.
_ASM_SKIP = (
    "#define __SYSCALL_LL",
    "((union",
    "#define SYSCALL_IPC",
    "#define SYSCALL_FADVISE",
    "#define VDSO",
    "#define IPC",
    "/*",
)

# Textual substitutions applied, in this order, to every kept line.
_ASM_REWRITE = (
    ("__syscall", "_syscall"),
    ("__inline", "inline"),
    ("__asm__ __volatile__ ", "asm"),
    ("\t", " "),
    ("unsigned long", "ulong"),
    ("static inline ", "static inline\n"),
)


def fmtsysasm(reader, writer):
    """Copy the assembly syscall routines from *reader* to *writer*.

    Emits an ``asm`` define first, drops the lines matched by ``_ASM_SKIP``
    and comment continuation lines, and rewrites the rest with
    ``_ASM_REWRITE``.
    """
    putline(writer)
    putcomment(writer, "assembly routines")
    putdefine(writer, "asm", "__asm__ __volatile__", left=5)
    for line in reader:
        if line.startswith(_ASM_SKIP) or line.strip().startswith("*"):
            continue

        for old, new in _ASM_REWRITE:
            line = line.replace(old, new)
        print(line, file=writer, end="")


def fmtsyscall(writer, musl):
    """Write syscall numbers and assembly stubs for the arch directory *musl*."""
    with open(f"{musl}/bits/syscall.h.in") as io:
        fmtsysnum(io, writer)

    with open(f"{musl}/syscall_arch.h") as io:
        fmtsysasm(io, writer)


# Directives of musl's internal syscall.h that are carried over.
_SYS_DIRECTIVES = (
    "#define SYS_",
    "#undef SYS_",
    "#ifdef SYS_",
    "#ifndef SYS_",
)


def fmtsyscallold(writer, musl):
    """Translate the ``SYS_*`` fixup directives of the file *musl* to *writer*.

    An ``#endif`` is only copied when it closes an ``#ifdef``/``#ifndef``
    that was itself copied.
    """
    with open(musl, "r") as reader:
        needend = False
        for line in reader:
            closes = line.startswith("#endif") and needend
            if not (line.startswith(_SYS_DIRECTIVES) or closes):
                continue

            word = line.split()
            directive = word[0]
            name = [syscall(w, prefix="SYS_") for w in word[1:]]
            if directive == "#endif":
                print(directive, file=writer)
                putline(writer)
                needend = False
            elif directive == "#ifdef" or directive == "#ifndef":
                needend = True
                print(directive, name[0], file=writer)
            elif directive == "#undef":
                print(f"# {directive[1:]:<8}{name[0]}", file=writer)
            else:
                print(f"# {directive[1:]:<8}{name[0]:<10} {name[1]}", file=writer)


# ------------------------------------------------------------------------
# types

stdintdefs = [
    "INTPTR_MIN", "INTPTR_MAX", "UINTPTR_MIN", "UINTPTR_MAX",
    "PTRDIFF_MIN", "PTRDIFF_MAX", "SIZE_MAX", "LONG_MAX",
]

int8 = "char"
int16 = "short"
int32 = "int"
int64 = {'x86_64': "long", 'i386': "long long", 'arm': "long long", 'aarch64': "long", 'riscv64': "long"}
addr = {'x86_64': "long", 'i386': "int", 'arm': "int", 'aarch64': "long", 'riscv64': "long"}

long64 = "0x7fffffffffffffffL"
long32 = "0x7fffffffL"
maxlong = {'x86_64': long64, 'i386': long32, 'arm': long32, 'aarch64': long64, 'riscv64': long64}


def fmtstdint(reader, writer):
    """Copy the defines of *reader* that mention a name from ``stdintdefs``.

    Matching is by substring, but every line is emitted at most once (the
    name ``UINTPTR_MAX`` also contains ``INTPTR_MAX``).
    """
    for line in reader:
        if not any(intdef in line for intdef in stdintdefs):
            continue
        word = line.split()
        if len(word) < 3 or word[0] != "#define":
            continue
        putdefine(writer, word[1], word[2])


def fmtbits(writer, musl, arch):
    """Write the base type and ABI defines for one architecture.

    *musl* is the musl arch directory to read from and *arch* the musl
    architecture name used to index the type tables above.  A missing
    ``bits/limits.h`` is reported on stderr and otherwise ignored.
    """
    putheader(writer)
    putline(writer)
    putcomment(writer, "base types")

    putdefine(writer, "INT8", int8)
    putdefine(writer, "INT16", int16)
    putdefine(writer, "INT32", int32)
    putdefine(writer, "INT64", int64[arch])
    putdefine(writer, "ADDR", addr[arch])

    putline(writer)
    putcomment(writer, "abi")

    limits = f"{musl}/bits/limits.h"
    if os.path.exists(limits):
        with open(limits) as reader:
            for line in reader:
                word = line.split()
                # the header may also hold blank lines and #if/#endif guards
                if len(word) < 3 or word[0] != "#define":
                    continue
                putdefine(writer, word[1], word[2])
    else:
        print(f"----> not found {limits}", file=stderr)

    with open(f"{musl}/bits/stdint.h") as io:
        fmtstdint(io, writer)

    putdefine(writer, "LONG_MAX", maxlong[arch])


# ------------------------------------------------------------------------
# main point of entry

muslroot = "../vendor/musl"


def _emit_arch(source, target):
    """Write bits.h and syscall.h for the musl arch *source* into ``linux/{target}``."""
    print(f"> for {target}", file=stderr)

    musl = f"{muslroot}/arch/{source}"

    libn = f"linux/{target}"
    libnarch = f"{libn}/arch"
    mkpath(libnarch, exist_ok=True)

    print("--> emitting types", file=stderr)
    with open(f"{libnarch}/bits.h", "w") as io:
        fmtbits(io, musl, source)

    print("--> emitting syscalls", file=stderr)
    with open(f"{libn}/syscall.h", "w") as io:
        fmtsyscall(io, musl)


def _emit_portable():
    """Write the architecture independent errno header and message table."""
    # TODO: generalize away from linux
    # portable operating system
    print("--> emitting errno", file=stderr)
    source, target = "generic", "port"

    musl = f"{muslroot}/arch/{source}"
    libn = f"linux/{target}/arch"
    # create the directory that is written to below, not the directory of
    # whichever architecture happened to be processed last
    mkpath(libn, exist_ok=True)
    with open(f"{libn}/errno.h", "w") as io:
        fmterrno(io, musl)

    musl = f"{muslroot}/src/errno"
    libn = "src/errno.inc.h"
    with open(libn, "w") as io:
        fmtstrerror(io, musl)


def _append_legacy_syscalls():
    """Append the legacy syscall fixups to src/internal.h.

    Nothing is appended unless the file currently ends with the three-line
    "computer generated" banner.
    """
    libn = "src/internal.h"
    musl = f"{muslroot}/src/internal/syscall.h"

    with open(libn, "r") as file:
        tail = file.readlines()[-3:]

    needfix = (
        len(tail) == 3
        and "/*********************************************" in tail[0]
        and "* XXX: below this line is computer generated" in tail[1]
        and "*********************************************/" in tail[2]
    )

    if needfix:
        print("--> appending legacy syscall fixup", file=stderr)
        with open(libn, "a") as io:
            fmtsyscallold(io, musl)


def main():
    """Generate every header for all supported architectures."""
    # architecture specifics
    for source, target in arch.items():
        if source == "generic":
            continue
        _emit_arch(source, target)

    _emit_portable()
    _append_legacy_syscalls()


if __name__ == "__main__":
    main()