#!/usr/bin/env python

import re
import gzip
from math import log

# supported operating systems/architectures
# Operating systems that headers are generated for.
# NOTE(review): not referenced anywhere else in the visible code; the output
# paths in the main script hard-code "linux" -- confirm whether this is used.
operating_system = [ 'linux' ]
# Maps a musl architecture directory name (key) to the directory name used
# for the generated output (value). The "generic" entry is skipped by the
# per-architecture loop in the main script.
arch = {
    'generic' : 'port',
    'x86_64'  : 'amd64',
    'i386'    : 'i386',
    'arm'     : 'arm',
    'aarch64' : 'arm64',
    'riscv64' : 'riscv64',
}

# name-translation dictionaries take the form: musl name => our name

# ------------------------------------------------------------------------
# errno

# name conversions
# Prefix prepended to every translated errno name by errwrap().
errprefix = "sys·Error"
# musl errno macro name => suffix that errwrap() appends to errprefix.
errno = {
    "EPERM"            :  "NotPermit",        "ENOENT"           :  "NoEntry",
    "ESRCH"            :  "NoProc",           "EINTR"            :  "Interrupt",
    "EIO"              :  "IO",               "ENXIO"            :  "NoDevAddr",
    "E2BIG"            :  "ArgLen",           "ENOEXEC"          :  "ExecFmt",
    "EBADF"            :  "BadFd",            "ECHILD"           :  "NoChild",
    "EAGAIN"           :  "Again",            "ENOMEM"           :  "NoMemory",
    "EACCES"           :  "Access",           "EFAULT"           :  "BadAddr",
    "ENOTBLK"          :  "NotBlk",           "EBUSY"            :  "Busy",
    "EEXIST"           :  "Exists",           "EXDEV"            :  "ExDevice",
    "ENODEV"           :  "NoDevice",         "ENOTDIR"          :  "NotDir",
    "EISDIR"           :  "IsDir",            "EINVAL"           :  "BadArg",
    "ENFILE"           :  "NoFiles",          "EMFILE"           :  "NoFd",
    "ENOTTY"           :  "NotTTY",           "ETXTBSY"          :  "TxtBusy",
    "EFBIG"            :  "BigFile",          "ENOSPC"           :  "NoSpace",
    "ESPIPE"           :  "NoSeek",           "EROFS"            :  "ReadOnly",
    "EMLINK"           :  "NoLink",           "EPIPE"            :  "BadPipe",
    "EDOM"             :  "BadDomain",        "ERANGE"           :  "BadResult",
    "EDEADLK"          :  "Deadlock",         "ENAMETOOLONG"     :  "NameLen",
    "ENOLCK"           :  "NoLock",           "ENOSYS"           :  "NoSyscall",
    "ENOTEMPTY"        :  "NotEmptyDir",      "ELOOP"            :  "LinkLoop",
    "EWOULDBLOCK"      :  "Blocks",           "ENOMSG"           :  "NoMsg",
    "EIDRM"            :  "NoIdent",          "ECHRNG"           :  "BadChannel",
    "EL2NSYNC"         :  "Lvl2NoSync",       "EL3HLT"           :  "Lvl3Halt",
    "EL3RST"           :  "Lvl3Reset",        "ELNRNG"           :  "LongChannel",
    "EUNATCH"          :  "NoDriver",         "ENOCSI"           :  "NoCSI",
    "EL2HLT"           :  "Lvl2Halt",         "EBADE"            :  "BadExchange",
    "EBADR"            :  "BadRequest",       "EXFULL"           :  "FullExchange",
    "ENOANO"           :  "NoAnode",          "EBADRQC"          :  "BadRequestCode",
    "EBADSLT"          :  "BadSlot",          "EDEADLOCK"        :  "Deadlock2",
    "EBFONT"           :  "BadFont",          "ENOSTR"           :  "NoStr",
    "ENODATA"          :  "NoData",           "ETIME"            :  "Timeout",
    "ENOSR"            :  "NoStream",         "ENONET"           :  "NoNet",
    "ENOPKG"           :  "NoPkg",            "EREMOTE"          :  "IsRemote",
    "ENOLINK"          :  "DeadLink",         "EADV"             :  "Adv",
    "ESRMNT"           :  "Srmnt",            "ECOMM"            :  "Communicate",
    "EPROTO"           :  "Protocol",         "EMULTIHOP"        :  "Multihop",
    "EDOTDOT"          :  "DotDot",           "EBADMSG"          :  "BadMsg",
    "EOVERFLOW"        :  "Overflow",         "ENOTUNIQ"         :  "NotUnique",
    "EBADFD"           :  "BadFdState",        "EREMCHG"          :  "NewAddress",
    "ELIBACC"          :  "ShlibAccess",      "ELIBBAD"          :  "BadShlib",
    "ELIBSCN"          :  "BadLibCode",       "ELIBMAX"          :  "ShlibLen",
    "ELIBEXEC"         :  "SlibExec",         "EILSEQ"           :  "BadWideChar",
    "ERESTART"         :  "Restart",          "ESTRPIPE"         :  "StreamPipe",
    "EUSERS"           :  "UserLen",          "ENOTSOCK"         :  "NotSocket",
    "EDESTADDRREQ"     :  "NeedDstAddr",      "EMSGSIZE"         :  "MsgLen",
    "EPROTOTYPE"       :  "BadProtocol",      "ENOPROTOOPT"      :  "NoProtocol",
    "EPROTONOSUPPORT"  :  "NoProtocolSupport","ESOCKTNOSUPPORT"  :  "NoSocketSupport",
    "EOPNOTSUPP"       :  "ProtocolSupport",  "ENOTSUP"          :  "NoSupport",
    "EPFNOSUPPORT"     :  "NoPFSupport",      "EAFNOSUPPORT"     :  "NoAFSupport",
    "EADDRINUSE"       :  "AddrInUse",        "EADDRNOTAVAIL"    :  "AddrNotAvail",
    "ENETDOWN"         :  "NetDown",          "ENETUNREACH"      :  "NetHang",
    "ENETRESET"        :  "NetReset",         "ECONNABORTED"     :  "AbortConnect",
    "ECONNRESET"       :  "ResetConnect",     "ENOBUFS"          :  "NoBuffer",
    "EISCONN"          :  "IsConnect",        "ENOTCONN"         :  "NotConnect",
    "ESHUTDOWN"        :  "Shutdown",         "ETOOMANYREFS"     :  "RefsLen",
    "ETIMEDOUT"        :  "TimedOut",         "ECONNREFUSED"     :  "RefuseConnect",
    "EHOSTDOWN"        :  "HostDown",         "EHOSTUNREACH"     :  "HostHang",
    "EALREADY"         :  "AlreadyConnect",   "EINPROGRESS"      :  "InProgress",
    "ESTALE"           :  "Stale",            "EUCLEAN"          :  "NeedClean",
    "ENOTNAM"          :  "NotNamed",         "ENAVAIL"          :  "NotAvail",
    "EISNAM"           :  "IsNamed",          "EREMOTEIO"        :  "RemoteIO",
    "EDQUOT"           :  "DiskQuota",        "ENOMEDIUM"        :  "NoMedium",
    "EMEDIUMTYPE"      :  "BadMedium",        "ECANCELED"        :  "Cancel",
    "ENOKEY"           :  "NoKey",            "EKEYEXPIRED"      :  "ExpireKey",
    "EKEYREVOKED"      :  "RevokeKey",        "EKEYREJECTED"     :  "RejectKey",
    "EOWNERDEAD"       :  "OwnerDie",         "ENOTRECOVERABLE"  :  "NoRecover",
    "ERFKILL"          :  "RFKill",           "EHWPOISON"        :  "HWPoison",
}

def errwrap(name):
    """Return the project-side identifier for the musl errno macro *name*.

    The result is ``errprefix`` followed by the suffix stored for *name* in
    the ``errno`` table. Raises KeyError when *name* has no entry.
    """
    suffix = errno[name]
    return f"{errprefix}{suffix}"

def fmterrno(writer, musl):
    """Translate ``{musl}/bits/errno.h`` into project-style errno defines.

    Every ``#define NAME VALUE`` line is rewritten with NAME passed through
    errwrap(). A numeric VALUE is copied verbatim; any other VALUE is treated
    as an alias of another errno macro and is translated with errwrap() too.

    Args:
        writer: text stream the defines are written to.
        musl: directory that contains ``bits/errno.h``.

    Raises:
        KeyError: when a macro name (or alias target) is missing from the
            ``errno`` table; the message names the unknown macro.
    """
    with open(f"{musl}/bits/errno.h", "r") as reader:
        for line in reader:
            word = line.split()
            # Blank lines split to [], and a define without a value has no
            # word[2]; neither can be translated, so skip them up front.
            if len(word) < 3 or word[0] != "#define":
                continue

            name, value = word[1], word[2]
            try:
                ours = errwrap(name)
                target = value if value.isdigit() else errwrap(value)
            except KeyError as err:
                # err.args[0] is the key that was actually missing, which may
                # be the alias target rather than the macro being defined.
                raise KeyError(f"{err.args[0]} not recognized") from err

            # Kept outside the try so that write errors are not misreported
            # as unknown errno names.
            putdefine(writer, ours, target, left=30, right=0)

def fmtstrerror(writer, musl):
    """Translate ``{musl}/__strerror.h`` into project-style ``E(...)`` entries.

    Each ``E(CODE, "message")`` line is rewritten with CODE passed through
    errwrap() (the literal ``0`` entry is kept as is) and the message
    lower-cased. Lines that are not complete ``E(...)`` entries are skipped.

    Args:
        writer: text stream the entries are written to.
        musl: directory that contains ``__strerror.h``.

    Raises:
        KeyError: propagated from errwrap() for an unknown errno name.
    """
    with open(f"{musl}/__strerror.h", "r") as reader:
        for line in reader:
            # Strip the line ending instead of slicing a fixed number of
            # characters, so a missing final newline or trailing blanks do
            # not cut into the message.
            entry = line.rstrip()
            if not (entry.startswith("E(") and entry.endswith(")")):
                continue

            # Split on the first comma only: commas inside the message text
            # belong to the message and must be preserved.
            code, _, text = entry[2:-1].partition(",")
            code = code.strip()
            if code != '0':
                code = errwrap(code)

            name = f"E({code},"
            mesg = text.strip().lower() + ')'
            print(f"{name:<35}{mesg}", file=writer)

# ------------------------------------------------------------------------
# syscalls

# Prefix prepended to every generated syscall name by syscall().
syscallprefix = "·"

class LanguageModel(object):
    """Word-cost model used to split run-together text into words.

    The model is built from a gzip-compressed, whitespace-separated word
    list. A word's cost grows with its position in the list (Zipf's law is
    assumed, cost = -log(probability)), and a string is segmented into the
    sequence of pieces with the smallest total cost.
    """

    def __init__(self, word_file):
        """Load the word list from the gzip file *word_file*.

        Raises:
            ValueError: when the list holds fewer than two words, because
                the cost formula is undefined for such a list.
        """
        self.word_file = word_file

        with gzip.open(word_file) as f:
            words = f.read().decode().split()
        if len(words) < 2:
            raise ValueError(f"{word_file}: word list needs at least two words")

        # Build a cost dictionary, assuming Zipf's law and
        # cost = -math.log(probability). log(len(words)) does not depend on
        # the word, so it is computed once.
        scale = log(len(words))
        self._wordcost = {word: log((rank + 1) * scale) for rank, word in enumerate(words)}
        self._maxword = max(len(word) for word in words)

    def add_word(self, w):
        """Register *w* with a cost of 0, replacing any existing cost."""
        self._wordcost[w] = 0
        self._maxword = max(self._maxword, len(w))

    def del_word(self, w):
        """Give a known word *w* a very large cost; unknown words are left alone."""
        if w in self._wordcost:
            self._wordcost[w] = 1000000000

    def split(self, s):
        """Uses dynamic programming to infer the location of spaces in a string without spaces.

        *s* is first cut at every match of the module-level REGEXP; each
        chunk is segmented on its own and the pieces are returned as one
        flat list.
        """
        chunks = [self._split(x) for x in REGEXP.split(s)]
        return [item for sublist in chunks for item in sublist]

    def _split(self, s):
        """Segment the single chunk *s* and return an iterator over its pieces."""
        # Cost of a substring that is not in the dictionary.
        unknown = float("inf")

        # Find the best match for the i first characters, assuming cost has
        # been built for the i-1 first characters.
        # Returns a pair (match_cost, match_length).
        def best_match(i):
            candidates = enumerate(reversed(cost[max(0, i - self._maxword):i]))
            return min(
                (c + self._wordcost.get(s[i - k - 1:i].lower(), unknown), k + 1)
                for k, c in candidates
            )

        # Build the cost array.
        cost = [0]
        for i in range(1, len(s) + 1):
            c, _ = best_match(i)
            cost.append(c)

        # Backtrack to recover the minimal-cost string.
        out = []
        i = len(s)
        while i > 0:
            c, k = best_match(i)
            assert c == cost[i]
            # Apostrophe and digit handling (added by Genesys)
            new_token = True
            # A lone apostrophe is never merged into the token after it.
            if s[i - k:i] != "'" and out:
                # Re-attach a split 's and re-join digits that were split
                # from the digits that follow them.
                if out[-1] == "'s" or (s[i - 1].isdigit() and out[-1][0].isdigit()):
                    out[-1] = s[i - k:i] + out[-1]
                    new_token = False
            # (End of Genesys addition)

            if new_token:
                out.append(s[i - k:i])

            i -= k

        # Tokens were collected right-to-left.
        return reversed(out)

# Shared model used by splitwords(). The word list is loaded as soon as this
# module is executed, from a path relative to the current working directory.
DEFAULT = LanguageModel("../data/words.txt.gz")
# Any run of characters other than ASCII letters, digits and apostrophes acts
# as a separator between the chunks LanguageModel.split() segments.
REGEXP  = re.compile("[^a-zA-Z0-9']+")

def splitwords(s):
    """Split *s* into a list of words using the shared DEFAULT model."""
    words = DEFAULT.split(s)
    return words

# Penalize these dictionary words: del_word() gives them a very large cost
# (only if they are present in the word list).
for w in ["pi", "sid"]:
    DEFAULT.del_word(w)

# Register syscall-specific abbreviations; add_word() gives each a cost of 0.
for w in [
    "clock", "idle", "uring", "shm", "sem", "set", "ctl", "cntl",
    "remap", "unmap", "sched", "fd", "io", "pid",
    "recv", "chmod", "chown", "lock", "stat", "sys",
    "async", "brk", "id", "unlock", "cwd", "syms",
    "attr", "chroot", "chdir", "mkdir", "rmdir",
    "gid", "uid", "serv", "acct", "unlink", "futex"]:
    DEFAULT.add_word(w)

# Substring rewrites applied by syscall() before word splitting:
# musl spelling => spelling used for the generated name.
corpus = {
    "msgsnd" : "msgsend",  "msgrcv" : "msgrecv", 
    "creat"  : "create", "mincore" : "mloaded",
}

def syscall(musl, prefix="__NR_"):
    """Convert a musl syscall macro name into the generated syscall name.

    *prefix* is stripped from *musl*, the ``corpus`` rewrites are applied,
    the remainder is split into words, and the title-cased words are joined
    behind ``syscallprefix``.
    """
    ours = musl.removeprefix(prefix)

    for source, target in corpus.items():
        # Skip the rewrite when the replacement spelling is already present.
        if target in ours:
            continue
        ours = ours.replace(source, target)

    parts = [word.title() for word in splitwords(ours)]
    return syscallprefix + "".join(parts)

def fmtsysnum(reader, writer):
    """Write a renamed ``#define`` for every numeric syscall number in *reader*.

    Only lines made of exactly three tokens that start with ``#define`` are
    considered, and only those whose value is all digits are written.
    """
    for line in reader:
        fields = line.split()
        if len(fields) != 3:
            continue

        directive, call, sysn = fields
        if directive != "#define":
            continue

        head = syscall(call)
        if not sysn.isdigit():
            continue
        print(f"#define {head:<30} {sysn}", file=writer)

def fmtsysasm(reader, writer):
    """Copy the syscall assembly header from *reader* to *writer*, restyled.

    A short preamble is written first. Lines beginning with one of the
    skipped prefixes (and comment continuation lines) are dropped; every
    other line has a fixed list of textual substitutions applied in order.
    """
    skipped = (
        "#define __SYSCALL_LL",
        "((union",
        "#define SYSCALL_IPC",
        "#define SYSCALL_FADVISE",
        "#define VDSO",
        "#define IPC",
        "/*",
    )
    # Applied top to bottom; later entries see the result of earlier ones.
    rewrites = (
        ("__syscall", "_syscall"),
        ("__inline", "inline"),
        ("__asm__ __volatile__ ", "asm"),
        ("\t", "    "),
        ("unsigned long", "ulong"),
        ("static inline ", "static inline\n"),
    )

    putline(writer)
    putcomment(writer, "assembly routines")
    putdefine(writer, "asm", "__asm__ __volatile__", left=5)

    for line in reader:
        if line.startswith(skipped) or line.strip().startswith("*"):
            continue

        for old, new in rewrites:
            line = line.replace(old, new)
        print(line, file=writer, end="")

def fmtsyscall(writer, musl):
    """Write syscall numbers, then the assembly routines, for the arch directory *musl*."""
    numbers = f"{musl}/bits/syscall.h.in"
    routines = f"{musl}/syscall_arch.h"

    with open(numbers) as reader:
        fmtsysnum(reader, writer)

    with open(routines) as reader:
        fmtsysasm(reader, writer)

def fmtsyscallold(writer, musl):
    """Rewrite the ``SYS_*`` preprocessor lines of the file *musl* to *writer*.

    Only ``#define``/``#undef``/``#ifdef``/``#ifndef`` lines that refer to a
    ``SYS_`` name are handled, plus the ``#endif`` that follows an opened
    conditional. The names are converted with syscall(prefix="SYS_").
    """
    directives = ("#define SYS_", "#undef SYS_", "#ifdef SYS_", "#ifndef SYS_")
    inside = False  # True between an emitted #ifdef/#ifndef and its #endif

    with open(musl, "r") as reader:
        for line in reader:
            closing = inside and line.startswith("#endif")
            if not (line.startswith(directives) or closing):
                continue

            tokens = line.split()
            keyword = tokens[0]
            names = [syscall(tok, prefix="SYS_") for tok in tokens[1:]]

            if keyword == "#endif":
                print(keyword, file=writer)
                putline(writer)
                inside = False
            elif keyword in ("#ifdef", "#ifndef"):
                inside = True
                print(keyword, names[0], file=writer)
            elif keyword == "#undef":
                print(f"#    {keyword[1:]:<8}{names[0]}", file=writer)
            else:
                print(f"#    {keyword[1:]:<8}{names[0]:<10} {names[1]}", file=writer)

# ------------------------------------------------------------------------
# atomic

def snaketolower(name):
    """Return *name* with every underscore removed."""
    return name.replace("_", "")

def modifytype(line):
    """Replace the C fixed-width type names in *line* with the project's names."""
    renames = (
        ("uint64_t", "uint64"),
        ("uint32_t", "uint32"),
        ("uintptr_t", "uintptr"),
    )
    for ctype, ours in renames:
        line = line.replace(ctype, ours)
    return line

def fmtatomic(writer, reader):
    """Copy the atomics header from *reader* to *writer* with project naming.

    Type names are converted with modifytype(), every space-separated token
    containing ``a_`` is renamed, self-referential defines and the define
    mentioning ``libc.h`` are dropped, and brace/keyword spacing is adjusted.
    """
    def fmt(name):
        # Keep whatever precedes the first "a_", drop every "a_", and put
        # the atomic prefix in front of the de-underscored remainder.
        index = name.find("a_")
        prefix = name[:index] if index > 0 else ""
        rest = name[index:].replace("a_", "")
        return prefix + "atomic·" + snaketolower(rest)

    # Applied in order to each output line.
    spacing = (
        ("static inline void ", "static inline void\n"),
        ("static inline int ", "static inline int\n"),
        ("if (", "if("),
        ("for (", "for("),
        ("while (", "while("),
        ("do {", "do{"),
        ("} while", "}while"),
        (") {", "){"),
    )

    for line in reader:
        word = [modifytype(w) for w in line.split(" ")]

        if word[0].strip() == "#define":
            # "#define X X" carries no information once names are rewritten.
            if len(word) == 3 and word[1].strip() == word[2].strip():
                continue
            if len(word) == 2 and "libc.h" in word[1]:
                continue

        line = " ".join(fmt(w) if "a_" in w else w for w in word)
        for old, new in spacing:
            line = line.replace(old, new)

        print(line, file=writer, end="")

# ------------------------------------------------------------------------
# types

# Macro names that fmtstdint() looks for in the stdint header it is given.
stdintdefs=[
    "INTPTR_MIN",  "INTPTR_MAX",  "UINTPTR_MIN", "UINTPTR_MAX",
    "PTRDIFF_MIN", "PTRDIFF_MAX", "SIZE_MAX",    "LONG_MAX",
]

# C type spellings that fmtbits() emits for the INT8..INT64, ADDR and REG
# macros. The per-architecture tables are keyed by musl arch directory name.
int8  = "char"
int16 = "short"
int32 = "int"
int64 = {'x86_64':"long",'i386':"long long",'arm':"long long",'aarch64':"long",'riscv64':"long"}
addr  = {'x86_64':"long",'i386':"int",'arm':"int",'aarch64':"long",'riscv64':"long"}
reg   = {'x86_64':"long",'i386':"int",'arm':"int",'aarch64':"long",'riscv64':"long"}

# LONG_MAX literal per architecture, emitted by fmtbits().
long64="0x7fffffffffffffffL"
long32="0x7fffffffL"
maxlong={'x86_64':long64, 'i386':long32, 'arm':long32, 'aarch64': long64, 'riscv64': long64}

def fmtstdint(reader, writer):
    """Copy the ``#define`` lines for the macros named in ``stdintdefs``.

    The macro name is compared exactly. A substring test on the whole line
    would make e.g. a ``UINTPTR_MAX`` line match both ``INTPTR_MAX`` and
    ``UINTPTR_MAX`` and emit the same define twice.

    Args:
        reader: iterable of header lines.
        writer: text stream the defines are written to.
    """
    for line in reader:
        word = line.split()
        # Needs "#define", a name and a value; anything shorter is skipped.
        if len(word) < 3 or word[0] != "#define":
            continue
        if word[1] in stdintdefs:
            putdefine(writer, word[1], word[2])

def fmtbits(writer, musl, arch):
    """Write the per-architecture ``bits.h`` header to *writer*.

    The header consists of the include guard, the base type macros, the
    defines found in ``{musl}/bits/limits.h`` (when that file exists), the
    selected defines from ``{musl}/bits/stdint.h`` and finally LONG_MAX.

    Args:
        writer: text stream the header is written to.
        musl: musl architecture directory to read from.
        arch: architecture key into the ``int64``/``addr``/``reg``/``maxlong``
            tables (note that it shadows the module-level ``arch`` table
            inside this function).

    Raises:
        KeyError: when *arch* is missing from one of the type tables.
    """
    putheader(writer)
    putline(writer)
    putcomment(writer, "base types")

    putdefine(writer, "INT8",  int8)
    putdefine(writer, "INT16", int16)
    putdefine(writer, "INT32", int32)
    putdefine(writer, "INT64", int64[arch])
    putdefine(writer, "ADDR",  addr[arch])
    putdefine(writer, "REG",   reg[arch])

    putline(writer)
    putcomment(writer, "abi")

    limits = f"{musl}/bits/limits.h"
    if os.path.exists(limits):
        with open(limits) as reader:
            for line in reader:
                word = line.split()
                # Only "#define NAME VALUE" lines can be copied; blank lines
                # and #if/#endif lines would otherwise crash the indexing or
                # be emitted as bogus defines.
                if len(word) < 3 or word[0] != "#define":
                    continue
                putdefine(writer, word[1], word[2])
    else:
        print(f"----> not found {limits}", file=stderr)

    with open(f"{musl}/bits/stdint.h") as io:
        fmtstdint(io, writer)

    putdefine(writer, "LONG_MAX", maxlong[arch])

# ------------------------------------------------------------------------
# main point of entry

import os

from sys import stdout, stderr
from os import makedirs as mkpath

def putheader(writer):
    """Write the ``#pragma once`` include guard line to *writer*."""
    guard = "#pragma once"
    print(guard, file=writer)

def putline(writer):
    """Write one empty line to *writer*."""
    print(file=writer)

def putcomment(writer, string):
    """Write *string* to *writer* as a one-line C block comment."""
    print("/*", string, "*/", file=writer)

def putdefine(writer, name, value, left=20, right=0):
    """Write ``#define name value`` to *writer* with column padding.

    *name* is left-aligned in a field of *left* characters and *value* in a
    field of *right* characters.
    """
    padded_name = format(name, f"<{left}")
    padded_value = format(value, f"<{right}")
    print("#define", padded_name, padded_value, file=writer)

# Root of the vendored musl tree, relative to the current working directory.
muslroot = "../vendor/musl"
if __name__ == "__main__":
    # architecture specifics
    for source,target in arch.items():
        if source == "generic":
            continue

        print(f"> for {target}",file=stderr)

        musl = f"{muslroot}/arch/{source}"

        libn = f"linux/{target}"
        libnarch = f"{libn}/arch"
        mkpath(libnarch, exist_ok=True)

        print("--> emitting types", file=stderr)
        with open(f"{libnarch}/bits.h", "w") as io:
            fmtbits(io, musl, source)

        print("--> emitting atomics", file=stderr)
        with open(f"{musl}/atomic_arch.h") as reader, open(f"{libnarch}/atomic.h", "w") as writer:
            fmtatomic(writer,reader)

        print("--> emitting syscalls", file=stderr)
        with open(f"{libn}/syscall.h", "w") as io:
            fmtsyscall(io,musl)

    # TODO: generalize away from linux
    # portable operating system
    print("--> emitting errno", file=stderr)
    source, target = "generic", "port"

    musl = f"{muslroot}/arch/{source}"
    libn = f"linux/{target}/os"
    # Create the directory errno.h is written to. The stale `libnarch` left
    # over from the loop above names the last architecture's directory
    # instead, so the open() below would fail on a fresh tree.
    mkpath(libn, exist_ok=True)
    with open(f"{libn}/errno.h","w") as io:
        fmterrno(io, musl)

    musl = f"{muslroot}/src/errno"
    libn = "linux/src/errno.inc.h"
    with open(libn,"w") as io:
        fmtstrerror(io, musl)

    libn = "linux/src/internal.h"
    musl = f"{muslroot}/src/internal/syscall.h"

    # The fixup is appended only while the file still ends with the
    # three-line marker banner, so it is not appended twice.
    marker = (
        "/*********************************************",
        "* XXX: below this line is computer generated",
        "*********************************************/",
    )
    with open(libn,"r") as file:
        tail = file.readlines()[-3:]
    # A file with fewer than three lines cannot end with the marker.
    needfix = len(tail) == 3 and all(mark in text for mark, text in zip(marker, tail))

    if needfix:
        print("--> appending legacy syscall fixup", file=stderr)
        with open(libn,"a") as io:
            fmtsyscallold(io, musl)