npyfile: Start implementing Writer, add save/load convenience

jonnor · jonnor · commit 6c2e11620878 · 2024-08-11T15:28:33.000+02:00
diff --git a/benchmarks/iir/npyfile.py b/benchmarks/iir/npyfile.py
@@ -10,6 +10,8 @@
 import struct 
 import array
 
+NPY_MAGIC = b'\x93NUMPY'
+
 format_mapping = {
     # npy format => (array.array typecode, itemsize in bytes)
     b'f8': ('d', 8),
@@ -23,9 +25,23 @@ def find_section(data, prefix, suffix):
     section = data[start:end]
     return section
 
-def array_from_bytes(typecode, buffer):
-    # Workaround due
-    return array.array(typecode, buffer)
+def array_tobytes_generator(arr):
+    # Yields the items packed one by one: MicroPython has no array.tobytes()
+    fmt = array_typecode(arr)
+    for value in arr:
+        packed = struct.pack(fmt, value)
+        yield packed
+
+
+def array_typecode(arr):
+    # The repr looks like "array('f', ...)", so the typecode sits at index 7
+    return str(arr)[7:8]
+
+def compute_items(shape):
+    count = 1  # product of the dimensions; stays 1 for an empty shape
+    for dim in shape:
+        count *= dim
+    return count
 
 class Reader():
 
@@ -38,7 +54,6 @@ def __init__(self, filelike, header_maxlength=16*10):
 
         self.header_maxlength = header_maxlength
 
-
     def close(self):
         if self.file:
             self.file.close()
@@ -57,22 +72,21 @@ def _read_header(self):
         data = self.file.read(self.header_maxlength)
 
         # Check magic
-        npy_magic = b'\x93NUMPY'
-        magic = data[0:len(npy_magic)] 
-        assert magic == npy_magic, magic
+        magic = data[0:len(NPY_MAGIC)] 
+        assert magic == NPY_MAGIC, magic
 
         # Check version
-        major, minor = struct.unpack_from('BB', data, len(npy_magic))
+        major, minor = struct.unpack_from('BB', data, len(NPY_MAGIC))
         if major == 0x01:
-            header_length = struct.unpack_from('<H', data, len(npy_magic)+2)[0]
-            header_start = len(npy_magic)+2+2
+            header_length = struct.unpack_from('<H', data, len(NPY_MAGIC)+2)[0]
+            header_start = len(NPY_MAGIC)+2+2
         elif major == 0x02:
-            header_length = struct.unpack_from('<I', data, len(npy_magic)+2)[0]
-            header_start = len(npy_magic)+2+2
+            header_length = struct.unpack_from('<I', data, len(NPY_MAGIC)+2)[0]
+            header_start = len(NPY_MAGIC)+2+4
         else:
             raise ValueError("Unsupported npy format version")
 
-        #print('hs', header_start, data[header_start:header_start+header_length])
+        print('hs', header_start, data[header_start:header_start+header_length])
 
         # Parse header info
         type_info = find_section(data, b"'descr': '", b"',")
@@ -104,10 +118,7 @@ def _read_header(self):
     def read_data_chunks(self, chunksize):
 
         # determine amount of data expected
-        total_items = 1
-        for d in self.shape:
-            total_items *= d
-        total_data_bytes = self.itemsize * total_items
+        total_data_bytes = self.itemsize * compute_items(self.shape)
 
         # read the data
         self.file.seek(self.data_start)
@@ -116,19 +127,140 @@ def read_data_chunks(self, chunksize):
         read_bytes = 0
         while read_bytes < total_data_bytes:
             sub = self.file.read(chunksize_bytes)
-            arr = array_from_bytes(self.typecode, sub)
+            arr = array.array(self.typecode, sub)
             yield arr
             read_bytes += len(sub)
 
-def test_simple():
+
+class Writer():
+    """Write a .npy file incrementally. Use as a context manager."""
+    def __init__(self, filelike, shape, typecode):
+        # filelike is either a path (opened here for writing) or an open file
+        if isinstance(filelike, str):
+            self.file = open(filelike, 'wb')
+        else:
+            self.file = filelike
+
+        self.typecode = typecode
+        self.shape = shape
+
+    def close(self):
+        if self.file:
+            self.file.close()
+        self.file = None
+
+    def __enter__(self):
+        self._write_header()
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        self.close()
+
+    def _write_header(self):
+        """Write magic, version 1.0 and the padded header describing the data."""
+        shape = self.shape
+        typecode = self.typecode
+        # Sanity checking
+        dimensions = len(shape)
+        assert dimensions >= 1, dimensions
+        assert dimensions <= 5, dimensions
+
+        # descr follows the typecode ('<' assumes a little-endian host)
+        kind = 'f' if typecode in 'fd' else ('u' if typecode.isupper() else 'i')
+        dtype = '<' + kind + str(struct.calcsize(typecode))
+        # A 1d shape needs a trailing comma to be a tuple, like (100,)
+        shape_str = ', '.join(str(d) for d in shape) + (',' if dimensions == 1 else '')
+        header = f"{{'descr': '{dtype}', 'fortran_order': False, 'shape': ({shape_str}), }}"
+
+        # End with newline, space-padded so that data start is aligned to 16 bytes
+        data_start = len(NPY_MAGIC)+2+2+len(header)
+        padding = 16-(data_start % 16)
+        header = header + (' ' * (padding - 1)) + '\n'
+        header_length = len(header)
+        data_start = len(NPY_MAGIC)+2+2+len(header)
+        assert data_start % 16 == 0, data_start
+
+        self.file.write(NPY_MAGIC)
+        self.file.write(bytes([0x01, 0x00])) # version
+        self.file.write(struct.pack('<H', header_length))
+        header_data = header.encode('ascii')
+        assert len(header_data) == len(header)
+        self.file.write(header_data)
+
+    def write_values(self, arr):
+        """Append the items of arr; its typecode must match the Writer's."""
+        input_typecode = array_typecode(arr)
+        assert input_typecode == self.typecode, (input_typecode, self.typecode)
+        for buf in array_tobytes_generator(arr):
+            self.file.write(buf)
+
+
+def load(filelike) -> tuple[tuple, array.array]:
+    """
+    Read a whole .npy file into memory
+
+    One-shot convenience wrapper that returns (shape, values).
+    Use npyfile.Reader directly when the data should be streamed
+    """
+
+    with Reader(filelike) as reader:
+        # Ask for all the items as a single chunk
+        n_items = compute_items(reader.shape)
+        chunks = list(reader.read_data_chunks(n_items))
+
+    # Everything was requested at once,
+    # so anything but exactly one chunk is an error
+    assert len(chunks) == 1
+    return reader.shape, chunks[0]
+
+def save(filelike, arr : array.array, shape=None):
+    """
+    Write an array to a .npy file
+
+    One-shot convenience wrapper. Without a shape the data is stored as 1d.
+    Use npyfile.Writer directly when the data should be streamed
+    """
+    # without an explicit shape, treat the data as one-dimensional
+    if shape is None:
+        shape = (len(arr), )
+
+    item_type = array_typecode(arr)
+    expected = compute_items(shape)
+    # the shape has to account for every item in arr
+    assert expected == len(arr), (shape, expected, len(arr))
+
+    with Writer(filelike, shape, item_type) as writer:
+        writer.write_values(arr)
+
+
+def test_reader_simple():
 
     with Reader('benchmarks/iir/noise.npy') as reader:
         print(reader.shape, reader.typecode, reader.itemsize)
 
-        for s in reader.read_data_chunks(100):
-            print(s)
+        for s in reader.read_data_chunks(500):
+            print(len(s))
+
+
+def test_writer_simple():
+    """Round-trip check: a saved array loads back with same shape and values"""
+    count = 100
+    values = array.array('f', range(count))
+    expected_shape = (count, )
+
+    out_path = 'out.npy'
+
+    # saving must succeed
+    save(out_path, values, shape=expected_shape)
+
+    # and loading must give back what was saved
+    got_shape, got_values = load(out_path)
+    assert got_shape == expected_shape
+    assert list(values) == list(got_values)
+
 
-test_simple()
+if __name__ == '__main__':  # run the tests only as a script, not on import
+    test_reader_simple(); test_writer_simple()
 
     # testcases
     # supported