tools/convert-unreal-*: Parse input in linear time

Non-trivial slicing of a bytes object on CPython makes a copy of the
sliced data. Because the parser re-slices the remaining buffer after
every field it reads, the overall parsing runs in quadratic time.

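For example, assuming an average field size of 10 bytes, parsing a 1 MB
file on my computer would take 70s in slicing alone. Wrapping the data
in io.BytesIO and consuming fields with read() avoids copying the
remainder on every field.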
This commit is contained in:
Valentin Lorentz 2024-06-11 19:55:25 +02:00 committed by Sadie Powell
parent 19b389468c
commit 28e2f30525
2 changed files with 15 additions and 31 deletions

View File

@ -18,6 +18,7 @@
#
import io
import os
import sys
@ -29,36 +30,28 @@ class UnrealDB:
with open(sys.argv[1], mode="rb") as fh:
data = fh.read()
if data.startswith(b"UnrealIRCd-DB-v1"):
self.data = data[40:]
self.data = io.BytesIO(data[40:])
elif data.startswith(b"UnrealIRCd-DB"):
self.error = f"Unsupported database version: {data[0:32]}"
else:
self.data = data
self.data = io.BytesIO(data)
except OSError as e:
self.error = f"Read error: {e}"
def read_i16(self):
tmp = int.from_bytes(self.data[0:2], byteorder="little")
self.data = self.data[2:]
return tmp
return int.from_bytes(self.data.read(2), byteorder="little")
def read_i32(self):
tmp = int.from_bytes(self.data[0:4], byteorder="little")
self.data = self.data[4:]
return tmp
return int.from_bytes(self.data.read(4), byteorder="little")
def read_i64(self):
tmp = int.from_bytes(self.data[0:8], byteorder="little")
self.data = self.data[8:]
return tmp
return int.from_bytes(self.data.read(8), byteorder="little")
def read_str(self):
len = self.read_i16()
if len == 0 or len == 0xFFFF:
return ""
tmp = self.data[0:len]
self.data = self.data[len:]
return str(tmp, "utf-8")
return self.data.read(len).decode("utf-8")
def error(msg):

View File

@ -18,6 +18,7 @@
#
import io
import os
import sys
@ -29,41 +30,31 @@ class UnrealDB:
with open(sys.argv[1], mode="rb") as fh:
data = fh.read()
if data.startswith(b"UnrealIRCd-DB-v1"):
self.data = data[40:]
self.data = io.BytesIO(data[40:])
elif data.startswith(b"UnrealIRCd-DB"):
self.error = f"Unsupported database version: {data[0:32]}"
else:
self.data = data
self.data = io.BytesIO(data)
except OSError as e:
self.error = f"Read error: {e}"
def read_char(self):
tmp = self.data[0:1]
self.data = self.data[1:]
return str(tmp, "utf-8")
return self.data.read(1).decode("utf-8")
def read_i16(self):
tmp = int.from_bytes(self.data[0:2], byteorder="little")
self.data = self.data[2:]
return tmp
return int.from_bytes(self.data.read(2), byteorder="little")
def read_i32(self):
tmp = int.from_bytes(self.data[0:4], byteorder="little")
self.data = self.data[4:]
return tmp
return int.from_bytes(self.data.read(4), byteorder="little")
def read_i64(self):
tmp = int.from_bytes(self.data[0:8], byteorder="little")
self.data = self.data[8:]
return tmp
return int.from_bytes(self.data.read(8), byteorder="little")
def read_str(self):
len = self.read_i16()
if len == 0 or len == 0xFFFF:
return ""
tmp = self.data[0:len]
self.data = self.data[len:]
return str(tmp, "utf-8")
return self.data.read(len).decode("utf-8")
def error(msg):