#!/usr/bin/env python
"""Translate musl's errno, syscall, atomic and type headers into our naming."""

import re
import gzip
from math import log

# supported operating systems/architectures
operating_system = [ 'linux' ]
arch = {
    'generic' : 'port',
    'x86_64'  : 'amd64',
    'i386'    : 'i386',
    'arm'     : 'arm',
    'aarch64' : 'arm64',
    'riscv64' : 'riscv64',
}

# all dictionaries take form: musl => us

# ------------------------------------------------------------------------
# errno

# name conversions
errprefix = "sys·Error"
errno = {
    "EPERM"           : "NotPermit",
    "ENOENT"          : "NoEntry",
    "ESRCH"           : "NoProc",
    "EINTR"           : "Interrupt",
    "EIO"             : "IO",
    "ENXIO"           : "NoDevAddr",
    "E2BIG"           : "ArgLen",
    "ENOEXEC"         : "ExecFmt",
    "EBADF"           : "BadFd",
    "ECHILD"          : "NoChild",
    "EAGAIN"          : "Again",
    "ENOMEM"          : "NoMemory",
    "EACCES"          : "Access",
    "EFAULT"          : "BadAddr",
    "ENOTBLK"         : "NotBlk",
    "EBUSY"           : "Busy",
    "EEXIST"          : "Exists",
    "EXDEV"           : "ExDevice",
    "ENODEV"          : "NoDevice",
    "ENOTDIR"         : "NotDir",
    "EISDIR"          : "IsDir",
    "EINVAL"          : "BadArg",
    "ENFILE"          : "NoFiles",
    "EMFILE"          : "NoFd",
    "ENOTTY"          : "NotTTY",
    "ETXTBSY"         : "TxtBusy",
    "EFBIG"           : "BigFile",
    "ENOSPC"          : "NoSpace",
    "ESPIPE"          : "NoSeek",
    "EROFS"           : "ReadOnly",
    "EMLINK"          : "NoLink",
    "EPIPE"           : "BadPipe",
    "EDOM"            : "BadDomain",
    "ERANGE"          : "BadResult",
    "EDEADLK"         : "Deadlock",
    "ENAMETOOLONG"    : "NameLen",
    "ENOLCK"          : "NoLock",
    "ENOSYS"          : "NoSyscall",
    "ENOTEMPTY"       : "NotEmptyDir",
    "ELOOP"           : "LinkLoop",
    "EWOULDBLOCK"     : "Blocks",
    "ENOMSG"          : "NoMsg",
    "EIDRM"           : "NoIdent",
    "ECHRNG"          : "BadChannel",
    "EL2NSYNC"        : "Lvl2NoSync",
    "EL3HLT"          : "Lvl3Halt",
    "EL3RST"          : "Lvl3Reset",
    "ELNRNG"          : "LongChannel",
    "EUNATCH"         : "NoDriver",
    "ENOCSI"          : "NoCSI",
    "EL2HLT"          : "Lvl2Halt",
    "EBADE"           : "BadExchange",
    "EBADR"           : "BadRequest",
    "EXFULL"          : "FullExchange",
    "ENOANO"          : "NoAnode",
    "EBADRQC"         : "BadRequestCode",
    "EBADSLT"         : "BadSlot",
    "EDEADLOCK"       : "Deadlock2",
    "EBFONT"          : "BadFont",
    "ENOSTR"          : "NoStr",
    "ENODATA"         : "NoData",
    "ETIME"           : "Timeout",
    "ENOSR"           : "NoStream",
    "ENONET"          : "NoNet",
    "ENOPKG"          : "NoPkg",
    "EREMOTE"         : "IsRemote",
    "ENOLINK"         : "DeadLink",
    "EADV"            : "Adv",
    "ESRMNT"          : "Srmnt",
    "ECOMM"           : "Communicate",
    "EPROTO"          : "Protocol",
    "EMULTIHOP"       : "Multihop",
    "EDOTDOT"         : "DotDot",
    "EBADMSG"         : "BadMsg",
    "EOVERFLOW"       : "Overflow",
    "ENOTUNIQ"        : "NotUnique",
    "EBADFD"          : "BadFdState",
    "EREMCHG"         : "NewAddress",
    "ELIBACC"         : "ShlibAccess",
    "ELIBBAD"         : "BadShlib",
    "ELIBSCN"         : "BadLibCode",
    "ELIBMAX"         : "ShlibLen",
    "ELIBEXEC"        : "SlibExec",
    "EILSEQ"          : "BadWideChar",
    "ERESTART"        : "Restart",
    "ESTRPIPE"        : "StreamPipe",
    "EUSERS"          : "UserLen",
    "ENOTSOCK"        : "NotSocket",
    "EDESTADDRREQ"    : "NeedDstAddr",
    "EMSGSIZE"        : "MsgLen",
    "EPROTOTYPE"      : "BadProtocol",
    "ENOPROTOOPT"     : "NoProtocol",
    "EPROTONOSUPPORT" : "NoProtocolSupport",
    "ESOCKTNOSUPPORT" : "NoSocketSupport",
    "EOPNOTSUPP"      : "ProtocolSupport",
    "ENOTSUP"         : "NoSupport",
    "EPFNOSUPPORT"    : "NoPFSupport",
    "EAFNOSUPPORT"    : "NoAFSupport",
    "EADDRINUSE"      : "AddrInUse",
    "EADDRNOTAVAIL"   : "AddrNotAvail",
    "ENETDOWN"        : "NetDown",
    "ENETUNREACH"     : "NetHang",
    "ENETRESET"       : "NetReset",
    "ECONNABORTED"    : "AbortConnect",
    "ECONNRESET"      : "ResetConnect",
    "ENOBUFS"         : "NoBuffer",
    "EISCONN"         : "IsConnect",
    "ENOTCONN"        : "NotConnect",
    "ESHUTDOWN"       : "Shutdown",
    "ETOOMANYREFS"    : "RefsLen",
    "ETIMEDOUT"       : "TimedOut",
    "ECONNREFUSED"    : "RefuseConnect",
    "EHOSTDOWN"       : "HostDown",
    "EHOSTUNREACH"    : "HostHang",
    "EALREADY"        : "AlreadyConnect",
    "EINPROGRESS"     : "InProgress",
    "ESTALE"          : "Stale",
    "EUCLEAN"         : "NeedClean",
    "ENOTNAM"         : "NotNamed",
    "ENAVAIL"         : "NotAvail",
    "EISNAM"          : "IsNamed",
    "EREMOTEIO"       : "RemoteIO",
    "EDQUOT"          : "DiskQuota",
    "ENOMEDIUM"       : "NoMedium",
    "EMEDIUMTYPE"     : "BadMedium",
    "ECANCELED"       : "Cancel",
    "ENOKEY"          : "NoKey",
    "EKEYEXPIRED"     : "ExpireKey",
    "EKEYREVOKED"     : "RevokeKey",
    "EKEYREJECTED"    : "RejectKey",
    "EOWNERDEAD"      : "OwnerDie",
    "ENOTRECOVERABLE" : "NoRecover",
    "ERFKILL"         : "RFKill",
    "EHWPOISON"       : "HWPoison",
}

def errwrap(name):
    """Return our prefixed identifier for the musl errno `name`.

    Raises KeyError when `name` has no entry in the `errno` table.
    """
    return errprefix+errno[name]

def fmterrno(writer, musl):
    """Rewrite the `#define` lines of `{musl}/bits/errno.h` with our names.

    Numeric values are copied through; a value that is itself an errno name
    (an alias) is translated as well. Lines that are not a `#define` with at
    least a name and a value are skipped.

    Raises KeyError naming the errno that is missing from the `errno` table.
    """
    with open(f"{musl}/bits/errno.h", "r") as reader:
        for line in reader:
            word = line.split()
            # blank lines and anything that is not `#define NAME VALUE` carry no errno
            if len(word) < 3 or word[0] != "#define":
                continue

            name, value = word[1], word[2]
            try:
                ours = errwrap(name)
                theirs = value if value.isdigit() else errwrap(value)
            except KeyError as err:
                # err.args[0] is whichever of name/value failed the lookup
                raise KeyError(f"{err.args[0]} not recognized") from err

            putdefine(writer, ours, theirs, left=30, right=0)

def fmtstrerror(writer, musl):
    """Rewrite the `E(name, message)` lines of `{musl}/__strerror.h`.

    Each line starting with 'E' is re-emitted with the errno name translated
    (a literal `0` is kept as is) and the message text lowercased.
    """
    with open(f"{musl}/__strerror.h", "r") as reader:
        for line in reader:
            if not line.startswith('E'):
                continue

            # drop the leading `E(` and the trailing `)` + newline
            word = line[2:-2].split(",")
            if word[0] == '0':
                name = word[0]
            else:
                name = errwrap(word[0])
            name = 'E(' + name + ','
            mesg = "".join(w.strip() for w in word[1:]).lower() + ')'

            print(f"{name:<35}{mesg}", file=writer)

# ------------------------------------------------------------------------
# syscalls

syscallprefix = "·"

class LanguageModel(object):
    """Word segmenter driven by a ranked word list.

    Every word is assigned a cost derived from its position in the list;
    `split` chooses the segmentation with the lowest total cost.
    """

    def __init__(self, word_file):
        """Load the gzip-compressed, whitespace-separated list `word_file`."""
        self.word_file = word_file

        # Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
        with gzip.open(word_file) as f:
            words = f.read().decode().split()

        scale = log(len(words))  # loop invariant
        self._wordcost = {k: log((i+1)*scale) for i,k in enumerate(words)}
        self._maxword = max(len(x) for x in words)

    def add_word(self, w):
        """Register `w` with cost 0, making it the cheapest possible match."""
        self._wordcost[w] = 0 #log((num_words)*log(num_words))
        self._maxword = max(self._maxword, len(w))

    def del_word(self, w):
        """Make a known word `w` prohibitively expensive; unknown words are ignored."""
        if w in self._wordcost:
            self._wordcost[w] = 1000000000

    def split(self, s):
        """Uses dynamic programming to infer the location of spaces in a string without spaces."""
        # REGEXP is a module-level pattern defined after this class
        l = [self._split(x) for x in REGEXP.split(s)]
        return [item for sublist in l for item in sublist]

    def _split(self, s):
        """Segment one separator-free chunk; returns an iterator over the words."""
        # Find the best match for the i first characters, assuming cost has
        # been built for the i-1 first characters.
        # Returns a pair (match_cost, match_length).
        def best_match(i):
            candidates = enumerate(reversed(cost[max(0, i-self._maxword):i]))
            return min((c + self._wordcost.get(s[i-k-1:i].lower(), 9e999), k+1) for k,c in candidates)

        # Build the cost array.
        cost = [0]
        for i in range(1,len(s)+1):
            c,k = best_match(i)
            cost.append(c)

        # Backtrack to recover the minimal-cost string.
        out = []
        i = len(s)
        while i>0:
            c,k = best_match(i)
            assert c == cost[i]
            # Apostrophe and digit handling (added by Genesys)
            newToken = True
            if not s[i-k:i] == "'": # ignore a lone apostrophe
                if len(out) > 0:
                    # re-attach split 's and split digits
                    if out[-1] == "'s" or (s[i-1].isdigit() and out[-1][0].isdigit()): # digit followed by digit
                        out[-1] = s[i-k:i] + out[-1] # combine current token with previous token
                        newToken = False
            # (End of Genesys addition)

            if newToken:
                out.append(s[i-k:i])

            i -= k

        return reversed(out)

DEFAULT = LanguageModel("../data/words.txt.gz")
REGEXP = re.compile("[^a-zA-Z0-9']+")

def splitwords(s):
    """Split `s` into words using the module's default language model."""
    return DEFAULT.split(s)

# words that must never be picked as a segment
for w in ["pi", "sid"]:
    DEFAULT.del_word(w)

# words that should always be preferred as a segment
for w in [
    "clock", "idle", "uring", "shm", "sem", "set", "ctl", "cntl", "remap",
    "unmap", "sched", "fd", "io", "pid", "recv", "chmod", "chown", "lock",
    "stat", "sys", "async", "brk", "id", "unlock", "cwd", "syms", "attr",
    "chroot", "chdir", "mkdir", "rmdir", "gid", "uid", "serv", "acct",
    "unlink", "futex"]:
    DEFAULT.add_word(w)

# substrings rewritten before segmentation
corpus = {
    "msgsnd"  : "msgsend",
    "msgrcv"  : "msgrecv",
    "creat"   : "create",
    "mincore" : "mloaded",
}

def syscall(musl, prefix="__NR_"):
    """Convert a musl syscall macro name into our title-cased identifier.

    `prefix` is stripped, the `corpus` rewrites are applied, the remainder is
    split into words and each word is title-cased behind `syscallprefix`.
    """
    ours = musl.removeprefix(prefix)
    for source,target in corpus.items():
        # skip names that already contain the rewritten form (e.g. "create")
        if target not in ours:
            ours = ours.replace(source,target)

    return syscallprefix+"".join(word.title() for word in splitwords(ours))

def fmtsysnum(reader, writer):
    """Emit one renamed `#define` per numeric `#define NAME NUMBER` line of `reader`."""
    for line in reader:
        word = line.split()
        if len(word) != 3 or word[0] != "#define":
            continue

        call, sysn = word[1], word[2]
        if not sysn.isdigit():
            continue

        head = syscall(call)
        print(f"#define {head:<30} {sysn}", file=writer)

def fmtsysasm(reader, writer):
    """Copy the assembly syscall routines from `reader`, adapted to our style.

    Lines beginning with any of the prefixes below (and comment lines) are
    dropped; the remaining lines get the textual substitutions applied in
    order and are written without an extra newline.
    """
    dropped = (
        "#define __SYSCALL_LL",
        "((union",
        "#define SYSCALL_IPC",
        "#define SYSCALL_FADVISE",
        "#define VDSO",
        "#define IPC",
        "/*",
    )
    # applied in order
    # NOTE(review): whitespace inside these literals is reproduced as seen - confirm
    rewrites = (
        ("__syscall", "_syscall"),
        ("__inline", "inline"),
        ("__asm__ __volatile__ ", "asm"),
        ("\t", " "),
        ("unsigned long", "ulong"),
        ("static inline ", "static inline\n"),
    )

    putline(writer)
    putcomment(writer, "assembly routines")
    putdefine(writer, "asm", "__asm__ __volatile__", left=5)
    for line in reader:
        if line.startswith(dropped) or line.strip().startswith("*"):
            continue

        for old, new in rewrites:
            line = line.replace(old, new)

        print(line, file=writer, end="")

def fmtsyscall(writer, musl):
    """Write syscall numbers, then the assembly routines, for the arch directory `musl`."""
    with open(f"{musl}/bits/syscall.h.in") as reader:
        fmtsysnum(reader, writer)

    with open(f"{musl}/syscall_arch.h") as reader:
        fmtsysasm(reader, writer)

def fmtsyscallold(writer, musl):
    """Translate the `SYS_*` preprocessor fixups found in the file `musl`.

    Handles `#define`, `#undef`, `#ifdef` and `#ifndef` lines whose first
    operand starts with `SYS_`, plus the `#endif` closing such a conditional.
    """
    directives = ("#define SYS_", "#undef SYS_", "#ifdef SYS_", "#ifndef SYS_")
    with open(musl,"r") as reader:
        needend = False  # True while inside a conditional we have echoed
        for line in reader:
            if not (line.startswith(directives)
                    or (line.startswith("#endif") and needend)):
                continue

            word = line.split()
            name = [syscall(w,prefix="SYS_") for w in word[1:]]
            if word[0] == "#endif":
                print(word[0], file=writer)
                putline(writer)
                needend = False
            elif word[0] == "#ifdef" or word[0] == "#ifndef":
                needend = True
                print(word[0], name[0], file=writer)
            elif word[0] == "#undef":
                print(f"# {word[0][1:]:<8}{name[0]}", file=writer)
            else:
                print(f"# {word[0][1:]:<8}{name[0]:<10} {name[1]}", file=writer)

# ------------------------------------------------------------------------
# atomic

def snaketolower(name):
    """Return `name` with every underscore removed."""
    return name.replace("_", "")

def modifytype(line):
    """Replace the `<stdint.h>` type names in `line` with our spellings."""
    return line.replace("uint64_t", "uint64").replace("uint32_t", "uint32").replace("uintptr_t","uintptr")

def fmtatomic(writer, reader):
    """Rewrite an atomic header from `reader`, renaming `a_*` to `atomic·*`.

    Self-referential `#define X X` lines and the `libc.h` include are dropped;
    brace and keyword spacing is adjusted by the substitutions below.
    """
    def fmt(name):
        # keep whatever precedes the first "a_" (e.g. an opening paren) as a prefix
        prefix = ""
        index = name.find("a_")
        if index > 0:
            prefix = name[:index]
            name = name[index:]
        name = name.replace("a_", "")
        return prefix + "atomic·" + snaketolower(name)

    # applied in order
    restyle = (
        ("static inline void ", "static inline void\n"),
        ("static inline int ", "static inline int\n"),
        ("if (", "if("),
        ("for (", "for("),
        ("while (", "while("),
        ("do {", "do{"),
        ("} while", "}while"),
        (") {", "){"),
    )

    for line in reader:
        word = [modifytype(w) for w in line.split(" ")]
        if len(word) == 3:
            if word[0].strip() == "#define" and word[1].strip() == word[2].strip():
                continue
        if len(word) == 2:
            if word[0].strip() == "#define" and "libc.h" in word[1]:
                continue

        line = " ".join(fmt(w) if "a_" in w else w for w in word)
        for old, new in restyle:
            line = line.replace(old, new)

        print(line, file=writer, end="")

# ------------------------------------------------------------------------
# types

stdintdefs=[
    "INTPTR_MIN",
    "INTPTR_MAX",
    "UINTPTR_MIN",
    "UINTPTR_MAX",
    "PTRDIFF_MIN",
    "PTRDIFF_MAX",
    "SIZE_MAX",
    "LONG_MAX",
]

int8  = "char"
int16 = "short"
int32 = "int"
int64 = {'x86_64':"long",'i386':"long long",'arm':"long long",'aarch64':"long",'riscv64':"long"}
addr  = {'x86_64':"long",'i386':"int",'arm':"int",'aarch64':"long",'riscv64':"long"}
reg   = {'x86_64':"long",'i386':"int",'arm':"int",'aarch64':"long",'riscv64':"long"}

long64="0x7fffffffffffffffL"
long32="0x7fffffffL"
maxlong={'x86_64':long64, 'i386':long32, 'arm':long32, 'aarch64': long64, 'riscv64': long64}

def fmtstdint(reader, writer):
    """Copy the `#define` lines of `reader` that mention a name in `stdintdefs`.

    Each matching line is emitted exactly once, even when several entries of
    `stdintdefs` occur in it ("INTPTR_MAX" is a substring of "UINTPTR_MAX").
    """
    for line in reader:
        if not any(intdef in line for intdef in stdintdefs):
            continue

        word = line.split()
        if len(word) < 3 or word[0] != "#define":
            continue

        putdefine(writer, word[1], word[2])

def fmtbits(writer, musl, arch):
    """Write the base-type and ABI-limit definitions for architecture `arch`.

    `musl` is the architecture's musl directory; `arch` is its musl name and
    must be a key of `int64`, `addr`, `reg` and `maxlong` (KeyError otherwise).
    A missing `bits/limits.h` is reported on stderr and otherwise ignored.
    """
    putheader(writer)
    putline(writer)
    putcomment(writer, "base types")
    putdefine(writer, "INT8",  int8)
    putdefine(writer, "INT16", int16)
    putdefine(writer, "INT32", int32)
    putdefine(writer, "INT64", int64[arch])
    putdefine(writer, "ADDR",  addr[arch])
    putdefine(writer, "REG",   reg[arch])

    putline(writer)
    putcomment(writer, "abi")
    limits = f"{musl}/bits/limits.h"
    if os.path.exists(limits):
        with open(limits) as reader:
            for line in reader:
                word = line.split()
                # only `#define NAME VALUE` lines can be forwarded
                if len(word) < 3 or word[0] != "#define":
                    continue
                putdefine(writer, word[1], word[2])
    else:
        print(f"----> not found {limits}", file=stderr)

    with open(f"{musl}/bits/stdint.h") as reader:
        fmtstdint(reader, writer)

    putdefine(writer, "LONG_MAX", maxlong[arch])

# ------------------------------------------------------------------------
# main point of entry
import os
from sys import stdout, stderr
from os import makedirs as mkpath

def putheader(writer):
    """Write the include-guard pragma that opens a generated header."""
    print("#pragma once", file=writer)

def putline(writer):
    """Write an empty line."""
    print("", file=writer)

def putcomment(writer, string):
    """Write `string` as a one-line C block comment."""
    print(f"/* {string} */", file=writer)

def putdefine(writer, name, value, left=20, right=0):
    """Write `#define name value`, left-aligning the fields to `left`/`right` columns."""
    print(f"#define {name:<{left}} {value:<{right}}", file=writer)

muslroot = "../vendor/musl"

def needsfixup(lines):
    """Report whether `lines` ends with the three-line "computer generated" banner.

    Returns False when fewer than three lines are available, so an empty or
    truncated file is treated as "nothing to append" rather than an error.
    """
    tail = lines[-3:]
    if len(tail) < 3:
        return False

    return ("/*********************************************" in tail[0]
            and "* XXX: below this line is computer generated" in tail[1]
            and "*********************************************/" in tail[2])

def main():
    """Generate every header: per-architecture files first, then the portable ones."""
    # architecture specifics
    for source,target in arch.items():
        if source == "generic":
            continue

        print(f"> for {target}",file=stderr)
        musl = f"{muslroot}/arch/{source}"
        libn = f"linux/{target}"
        libnarch = f"{libn}/arch"
        mkpath(libnarch, exist_ok=True)

        print("--> emitting types", file=stderr)
        with open(f"{libnarch}/bits.h", "w") as writer:
            fmtbits(writer, musl, source)

        print("--> emitting atomics", file=stderr)
        with open(f"{musl}/atomic_arch.h") as reader, open(f"{libnarch}/atomic.h", "w") as writer:
            fmtatomic(writer,reader)

        print("--> emitting syscalls", file=stderr)
        with open(f"{libn}/syscall.h", "w") as writer:
            fmtsyscall(writer,musl)

    # TODO: generalize away from linux
    # portable operating system
    print("--> emitting errno", file=stderr)
    source, target = "generic", "port"
    musl = f"{muslroot}/arch/{source}"
    libn = f"linux/{target}/os"
    # create the directory written to below (was the stale per-arch path)
    mkpath(libn, exist_ok=True)
    with open(f"{libn}/errno.h","w") as writer:
        fmterrno(writer, musl)

    musl = f"{muslroot}/src/errno"
    libn = "linux/src/errno.inc.h"
    with open(libn,"w") as writer:
        fmtstrerror(writer, musl)

    libn = "linux/src/internal.h"
    musl = f"{muslroot}/src/internal/syscall.h"
    with open(libn,"r") as file:
        needfix = needsfixup(file.readlines())

    # append only while the banner is still the last thing in the file
    if needfix:
        print("--> appending legacy syscall fixup", file=stderr)
        with open(libn,"a") as writer:
            fmtsyscallold(writer, musl)

if __name__ == "__main__":
    main()