diff --git a/.github/workflows/py.yml b/.github/workflows/py.yml index 685671d8..dd9b7117 100644 --- a/.github/workflows/py.yml +++ b/.github/workflows/py.yml @@ -19,7 +19,9 @@ jobs: - name: Run tests run: | cd flatdata-py - uv run --with pytest --with ../flatdata-generator pytest -v - pip install . - flatdata-inspector --help + uv venv + uv pip install ../flatdata-generator + uv pip install ".[inspector]" pytest + .venv/bin/pytest -v + .venv/bin/flatdata-inspector --help diff --git a/flatdata-generator/flatdata/generator/templates/py/python.jinja2 b/flatdata-generator/flatdata/generator/templates/py/python.jinja2 index 44655970..9be40870 100644 --- a/flatdata-generator/flatdata/generator/templates/py/python.jinja2 +++ b/flatdata-generator/flatdata/generator/templates/py/python.jinja2 @@ -10,6 +10,7 @@ import flatdata.lib as flatdata {{ struct.doc|to_python_doc}} class {{ tree.namespace_path(struct, "_") }}_{{ struct.name }}(flatdata.structure.Structure): """{{ struct.doc|safe_py_string_line }}""" + __slots__ = () _SCHEMA = """{{ tree.schema(struct) }}""" _NAME = "{{ tree.namespace_path(struct, "_") }}_{{ struct.name }}" _SIZE_IN_BITS = {{ struct.size_in_bits }} diff --git a/flatdata-generator/pyproject.toml b/flatdata-generator/pyproject.toml index 71861794..cac2c360 100644 --- a/flatdata-generator/pyproject.toml +++ b/flatdata-generator/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "flatdata-generator" -version = "0.4.10" +version = "0.4.11" description = "Generate source code for C++, Rust, Go or Python from a Flatdata schema file" readme = "README.md" authors = [ diff --git a/flatdata-generator/tests/generators/py_expectations/archives/multivector.py b/flatdata-generator/tests/generators/py_expectations/archives/multivector.py index b53dfd7f..2aeba51e 100644 --- a/flatdata-generator/tests/generators/py_expectations/archives/multivector.py +++ b/flatdata-generator/tests/generators/py_expectations/archives/multivector.py @@ 
-1,5 +1,6 @@ class n_S(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct S { @@ -20,6 +21,7 @@ class n_S(flatdata.structure.Structure): class n_T(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct T { @@ -40,6 +42,7 @@ class n_T(flatdata.structure.Structure): # Builtin type to for MultiVector index class n__builtin_multivector_IndexType8(flatdata.structure.Structure): """/** Builtin type to for MultiVector index */""" + __slots__ = () _SCHEMA = """""" _NAME = "n__builtin_multivector_IndexType8" _SIZE_IN_BITS = 8 @@ -53,6 +56,7 @@ class n__builtin_multivector_IndexType8(flatdata.structure.Structure): # Builtin type to for MultiVector index class n__builtin_multivector_IndexType16(flatdata.structure.Structure): """/** Builtin type to for MultiVector index */""" + __slots__ = () _SCHEMA = """""" _NAME = "n__builtin_multivector_IndexType16" _SIZE_IN_BITS = 16 @@ -66,6 +70,7 @@ class n__builtin_multivector_IndexType16(flatdata.structure.Structure): # Builtin type to for MultiVector index class n__builtin_multivector_IndexType64(flatdata.structure.Structure): """/** Builtin type to for MultiVector index */""" + __slots__ = () _SCHEMA = """""" _NAME = "n__builtin_multivector_IndexType64" _SIZE_IN_BITS = 64 diff --git a/flatdata-generator/tests/generators/py_expectations/archives/namespaces.py b/flatdata-generator/tests/generators/py_expectations/archives/namespaces.py index 2a731087..9e471683 100644 --- a/flatdata-generator/tests/generators/py_expectations/archives/namespaces.py +++ b/flatdata-generator/tests/generators/py_expectations/archives/namespaces.py @@ -1,5 +1,6 @@ class n_S(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct S { @@ -92,6 +93,7 @@ def __init__(self, resource_storage): class m_S(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace m { struct S { @@ -184,6 +186,7 @@ def __init__(self, resource_storage): # 
Builtin type to for MultiVector index class a__builtin_multivector_IndexType32(flatdata.structure.Structure): """/** Builtin type to for MultiVector index */""" + __slots__ = () _SCHEMA = """""" _NAME = "a__builtin_multivector_IndexType32" _SIZE_IN_BITS = 32 diff --git a/flatdata-generator/tests/generators/py_expectations/archives/ranges.py b/flatdata-generator/tests/generators/py_expectations/archives/ranges.py index 111c0762..c760b182 100644 --- a/flatdata-generator/tests/generators/py_expectations/archives/ranges.py +++ b/flatdata-generator/tests/generators/py_expectations/archives/ranges.py @@ -1,5 +1,6 @@ class n_S(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct S { diff --git a/flatdata-generator/tests/generators/py_expectations/archives/struct.py b/flatdata-generator/tests/generators/py_expectations/archives/struct.py index ae6a927d..62b14b88 100644 --- a/flatdata-generator/tests/generators/py_expectations/archives/struct.py +++ b/flatdata-generator/tests/generators/py_expectations/archives/struct.py @@ -1,5 +1,6 @@ class n_S(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct S { diff --git a/flatdata-generator/tests/generators/py_expectations/archives/vector.py b/flatdata-generator/tests/generators/py_expectations/archives/vector.py index 5f41860a..81e6f671 100644 --- a/flatdata-generator/tests/generators/py_expectations/archives/vector.py +++ b/flatdata-generator/tests/generators/py_expectations/archives/vector.py @@ -1,5 +1,6 @@ class n_S(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct S { diff --git a/flatdata-generator/tests/generators/py_expectations/structs/comments.py.1 b/flatdata-generator/tests/generators/py_expectations/structs/comments.py.1 index 5eb6c8dd..653496f0 100644 --- a/flatdata-generator/tests/generators/py_expectations/structs/comments.py.1 +++ b/flatdata-generator/tests/generators/py_expectations/structs/comments.py.1 
@@ -1,4 +1,5 @@ # This is a comment about Foo class n_Foo(flatdata.structure.Structure): """// This is a comment about Foo""" + __slots__ = () _SCHEMA = """namespace n { \ No newline at end of file diff --git a/flatdata-generator/tests/generators/py_expectations/structs/comments.py.2 b/flatdata-generator/tests/generators/py_expectations/structs/comments.py.2 index 1e9dac4f..36adabeb 100644 --- a/flatdata-generator/tests/generators/py_expectations/structs/comments.py.2 +++ b/flatdata-generator/tests/generators/py_expectations/structs/comments.py.2 @@ -1,3 +1,4 @@ # This is a comment about Foo class n_Foo(flatdata.structure.Structure): - """// This is a comment about Foo""" \ No newline at end of file + """// This is a comment about Foo""" + __slots__ = () \ No newline at end of file diff --git a/flatdata-generator/tests/generators/py_expectations/structs/integers.py b/flatdata-generator/tests/generators/py_expectations/structs/integers.py index 0fbf8f38..99829cc1 100644 --- a/flatdata-generator/tests/generators/py_expectations/structs/integers.py +++ b/flatdata-generator/tests/generators/py_expectations/structs/integers.py @@ -12,6 +12,7 @@ class n_U8(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct U8 { @@ -32,6 +33,7 @@ class n_U8(flatdata.structure.Structure): class n_I8(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct I8 { @@ -52,6 +54,7 @@ class n_I8(flatdata.structure.Structure): class n_U16(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct U16 { @@ -72,6 +75,7 @@ class n_U16(flatdata.structure.Structure): class n_I16(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct I16 { @@ -92,6 +96,7 @@ class n_I16(flatdata.structure.Structure): class n_U32(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct U32 { @@ -112,6 +117,7 @@ class n_U32(flatdata.structure.Structure): class 
n_I32(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct I32 { @@ -132,6 +138,7 @@ class n_I32(flatdata.structure.Structure): class n_U64(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct U64 { @@ -152,6 +159,7 @@ class n_U64(flatdata.structure.Structure): class n_I64(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct I64 { diff --git a/flatdata-generator/tests/generators/py_expectations/structs/namespaces.py b/flatdata-generator/tests/generators/py_expectations/structs/namespaces.py index 26f2f7e1..4519677c 100644 --- a/flatdata-generator/tests/generators/py_expectations/structs/namespaces.py +++ b/flatdata-generator/tests/generators/py_expectations/structs/namespaces.py @@ -1,5 +1,6 @@ class n_Foo(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct Foo { @@ -20,6 +21,7 @@ class n_Foo(flatdata.structure.Structure): class m_Foo(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace m { struct Foo { diff --git a/flatdata-generator/tests/generators/py_expectations/structs/unaligned.py b/flatdata-generator/tests/generators/py_expectations/structs/unaligned.py index 3c19adcc..c86cd6a1 100644 --- a/flatdata-generator/tests/generators/py_expectations/structs/unaligned.py +++ b/flatdata-generator/tests/generators/py_expectations/structs/unaligned.py @@ -1,5 +1,6 @@ class n_U8(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct U8 { @@ -23,6 +24,7 @@ class n_U8(flatdata.structure.Structure): class n_I8(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct I8 { @@ -46,6 +48,7 @@ class n_I8(flatdata.structure.Structure): class n_U16(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct U16 { @@ -69,6 +72,7 @@ class n_U16(flatdata.structure.Structure): class n_I16(flatdata.structure.Structure): 
"""""" + __slots__ = () _SCHEMA = """namespace n { struct I16 { @@ -92,6 +96,7 @@ class n_I16(flatdata.structure.Structure): class n_U32(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct U32 { @@ -115,6 +120,7 @@ class n_U32(flatdata.structure.Structure): class n_I32(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct I32 { @@ -138,6 +144,7 @@ class n_I32(flatdata.structure.Structure): class n_U64(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct U64 { @@ -161,6 +168,7 @@ class n_U64(flatdata.structure.Structure): class n_I64(flatdata.structure.Structure): """""" + __slots__ = () _SCHEMA = """namespace n { struct I64 { diff --git a/flatdata-py/README.md b/flatdata-py/README.md index d1579a9c..7efa2d13 100644 --- a/flatdata-py/README.md +++ b/flatdata-py/README.md @@ -18,6 +18,37 @@ Once you have [created a flatdata schema file](../README.md#creating-a-schema), flatdata-generator --gen py --schema locations.flatdata --output-file locations.py ``` +## Performance tips + +`flatdata-py` supports two data access patterns with very different performance characteristics on large archives. + +Iterating over a vector yields one Python object per element. Each field access unpacks bits from the underlying memory-mapped data. 
This is fine for accessing individual elements or small ranges, but has significant per-element overhead for bulk operations: + +```python +count = sum(1 for x in archive.links if x.speed_limit > 100) +``` + +For bulk operations, use the vectorized access methods that read fields directly into NumPy arrays: + +```python +# single column access, returns a pandas DataFrame +df = archive.links.speed_limit +count = len(df[df['speed_limit'] > 100]) + +# full NumPy structured array with all fields +arr = archive.links.to_numpy() +count = int(np.sum(arr['speed_limit'] > 100)) + +# slices work too +arr = archive.links[1000:2000].to_numpy() +df = archive.links[::10].to_data_frame() +``` + +* Use `vector.field_name` (column access) when you only need one or a few fields. +* Use `vector.to_numpy()` or `vector.to_data_frame()` when you need all fields at once. +* Use `vector[i].field` for random access to individual elements. +* The underlying data is memory-mapped; the OS pages it from disk on demand. Vectorized results are materialized as NumPy arrays in RAM. + ## Using the inspector `flatdata-py` comes with a handy tool called the `flatdata-inspector` to inspect the contents of an archive: diff --git a/flatdata-py/flatdata/lib/archive.py b/flatdata-py/flatdata/lib/archive.py index 34e27aa2..994a2963 100644 --- a/flatdata-py/flatdata/lib/archive.py +++ b/flatdata-py/flatdata/lib/archive.py @@ -39,9 +39,9 @@ def __init__(self, resource_storage): self.__getattr__(name) def __getattr__(self, name): - if name not in list(self._RESOURCES.keys()): + if name not in self._RESOURCES: raise AttributeError("Resource %s not defined in archive." 
% name)
-        if name not in list(self._loaded_resources.keys()):
+        if name not in self._loaded_resources:
             self._loaded_resources[name] = self._open_resource(name)
         return self._loaded_resources[name]
 
diff --git a/flatdata-py/flatdata/lib/data_access.py b/flatdata-py/flatdata/lib/data_access.py
index 359dfc65..6ffd021a 100644
--- a/flatdata-py/flatdata/lib/data_access.py
+++ b/flatdata-py/flatdata/lib/data_access.py
@@ -3,6 +3,8 @@
  See the LICENSE file in the root of this project for license details.
 '''
 
+import numpy as np
+
 # Sign bits cache for the value reading.
 _SIGN_BITS = [0] + [(1 << (bits - 1)) for bits in range(1, 65)]
 
@@ -20,7 +22,9 @@ def read_value(data, offset_bits, num_bits, is_signed):
         remainder = data[offset_bytes + total_bytes]
         result |= remainder << (total_bytes * 8 - offset_extra_bits)
 
-    if num_bits < 64:
+    # A 64-bit field at an unaligned offset merges an extra remainder byte, so the
+    # value assembled above can be wider than 64 bits and has to be masked as well.
+    if num_bits < 64 or offset_extra_bits > 0:
         result = result & ((1 << num_bits) - 1)
 
     if not is_signed:
@@ -62,3 +66,62 @@ def write_value(data, offset_bits, num_bits, is_signed, value):
         surrounding_bits = data[offset_bytes + byte_idx] & ~((1 << offset_bits) - 1)
         data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx] & ((1 << (8 - (bits_written % 8))) - 1)
         data[offset_bytes + byte_idx] |= surrounding_bits
+
+
+def read_field_vectorized(raw_bytes_2d, field_offset_bits, field_width_bits, is_signed):
+    """Read a bit-packed field from all elements at once, returning a numpy array.
+
+    Vectorized counterpart of read_value(): the field is decoded for every row
+    of raw_bytes_2d with a fixed number of numpy operations.
+
+    :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
+    :param field_offset_bits: bit offset of the field within each element
+    :param field_width_bits: width of the field in bits (1 to 64)
+    :param is_signed: whether to sign-extend the result
+    :return: numpy array with one value per element: int64 if is_signed,
+             uint64 otherwise
+    :raises ValueError: if field_width_bits is not in the range 1 to 64
+    """
+    if not 0 < field_width_bits <= 64:
+        raise ValueError("field_width_bits out of range: %d" % field_width_bits)
+
+    byte_start = field_offset_bits // 8
+    bit_shift = field_offset_bits % 8
+
+    if field_width_bits == 1:
+        # Single-bit fields are not sign-extended: they decode to 0 or 1.
+        column = raw_bytes_2d[:, byte_start].astype(np.uint64)
+        bits = (column >> np.uint64(bit_shift)) & np.uint64(1)
+        return bits.view(np.int64) if is_signed else bits
+
+    bytes_needed = (bit_shift + field_width_bits + 7) // 8
+
+    # Gather up to 8 little-endian bytes per element in a uint64 accumulator.
+    # Shift counts are np.uint64 as well, so that all of the arithmetic stays
+    # in uint64.
+    result = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
+    for b in range(min(bytes_needed, 8)):
+        result |= raw_bytes_2d[:, byte_start + b].astype(np.uint64) << np.uint64(b * 8)
+    result >>= np.uint64(bit_shift)
+
+    # An unaligned field can straddle nine bytes (bit_shift + width > 64). The
+    # accumulator then holds only the low 64 - bit_shift bits of the field, and
+    # the remaining high bits come from the ninth byte.
+    if bytes_needed > 8:
+        extra = raw_bytes_2d[:, byte_start + 8].astype(np.uint64)
+        result |= extra << np.uint64(64 - bit_shift)
+
+    if field_width_bits < 64:
+        result &= np.uint64((1 << field_width_bits) - 1)
+
+    if not is_signed:
+        return result
+    if field_width_bits == 64:
+        # The bit pattern already is the two's complement value.
+        return result.view(np.int64)
+
+    # Two's complement: values with the sign bit set represent value - 2**width.
+    sign_bit = np.uint64(1 << (field_width_bits - 1))
+    as_int64 = result.astype(np.int64)
+    wrapped = as_int64 + np.int64(-(1 << field_width_bits))
+    return np.where(result & sign_bit, wrapped, as_int64)
diff --git a/flatdata-py/flatdata/lib/resources.py b/flatdata-py/flatdata/lib/resources.py
index f371a7d2..6270b795 100644
--- a/flatdata-py/flatdata/lib/resources.py
+++ b/flatdata-py/flatdata/lib/resources.py
@@ -8,7 +8,7 @@
 import pandas as pd
 import numpy as np
 
-from .data_access import read_value
+from .data_access import read_value, read_field_vectorized
 from .errors import CorruptResourceError
 
 SIZE_OFFSET_IN_BITS = 64
@@ -24,6 +24,7 @@ def __init__(self, mem, element_type):
         self._element_type = element_type
         self._element_types = [element_type]
         self._type_size_in_bytes = self._element_type._SIZE_IN_BYTES if self._element_type else 1
+        self._raw_numpy_2d = None
 
     def size_in_bytes(self):
         return len(self._mem)
@@ -35,6 +36,62 @@ def _get_item(self, index):
         offset = self._item_offset(index)
         return self._element_type(self._mem, offset)
 
+    def _as_numpy_2d(self):
+        """Return the raw element bytes as a uint8 array of shape (n, struct_size).
+
+        The array is created on first use and cached. It is a zero-copy view if
+        self._mem supports the buffer protocol (bytes, mmap, memoryview, ...);
+        the view keeps that buffer exported for as long as the array is alive.
+        Storage objects that only support slicing are read through a slice,
+        which may copy the data.
+        """
+        if self._raw_numpy_2d is None:
+            n = len(self)
+            struct_size = self._type_size_in_bytes
+            num_bytes = n * struct_size
+            try:
+                # Slicing bytes or an mmap copies the data, so pass the whole
+                # buffer and let numpy apply the offset and length instead.
+                buffer, offset = memoryview(self._mem), SIZE_OFFSET_IN_BYTES
+            except TypeError:
+                end = SIZE_OFFSET_IN_BYTES + num_bytes
+                buffer, offset = self._mem[SIZE_OFFSET_IN_BYTES:end], 0
+            raw = np.frombuffer(buffer, dtype=np.uint8, count=num_bytes, offset=offset)
+            self._raw_numpy_2d = raw.reshape(n, struct_size)
+        return self._raw_numpy_2d
+
+    def _fields_to_numpy(self, raw_2d):
+        """Decode every field of the rows in raw_2d into a numpy structured array.
+
+        :param raw_2d: uint8 array of shape (rows, struct_size), e.g. a slice of
+                       the array returned by _as_numpy_2d()
+        """
+        element_type = self._element_type
+        result = np.empty(raw_2d.shape[0], dtype=element_type.dtype())
+        for name, field in element_type._FIELDS.items():
+            result[name] = read_field_vectorized(
+                raw_2d, field.offset, field.width, field.is_signed
+            )
+        return result
+
+    def _column_to_data_frame(self, name, rows=None):
+        """Decode the single field `name` into a one-column pandas DataFrame.
+
+        :param name: name of a field of the element type
+        :param rows: optional slice selecting the elements to decode
+        :raises AttributeError: if the element type has no field `name`; the
+                                containers call this from __getattr__
+        """
+        try:
+            field = self._element_type._FIELDS[name]
+        except KeyError:
+            raise AttributeError("Field %s not found in structure" % name) from None
+        raw_2d = self._as_numpy_2d()
+        if rows is not None:
+            raw_2d = raw_2d[rows]
+        values = read_field_vectorized(raw_2d, field.offset, field.width, field.is_signed)
+        return pd.DataFrame(data=values, columns=[name])
+
     def _repr_attributes(self):
         return {
             "container_type": self.__class__.__name__,
@@ -60,14 +117,13 @@ def __init__(self, s, sequence):
         self._sequence = sequence
 
     def to_numpy(self, limit=None):
-        indices = self._slice.indices(len(self._sequence))
-        num_items = len(range(*indices)) if not limit else limit
-        result = np.empty(
-            shape=num_items,
-            dtype=self._sequence._element_type.dtype()
-        )
-        for index, item in enumerate(self):
-            result[index] = item.as_tuple()
-        return result
+        """Decode the selected elements into a numpy structured array.
+
+        :param limit: if not None, only the first `limit` selected elements are decoded
+        """
+        rows = self._sequence._as_numpy_2d()[self._slice]
+        if limit is not None:
+            rows = rows[:limit]
+        return self._sequence._fields_to_numpy(rows)
 
     def to_data_frame(self, limit=None):
@@ -78,7 +134,11 @@ def __iter__(self):
             yield self._sequence[i]
 
     def __getattr__(self, name):
-        return pd.DataFrame(data=[[getattr(item, name)] for item in self], columns=[name])
+        # Dunder probes (copy, pickle, numpy) are never field names; rejecting them
+        # first also avoids infinite recursion while self._sequence is not yet set.
+        if name.startswith('__'):
+            raise AttributeError(name)
+        return self._sequence._column_to_data_frame(name, self._slice)
 
     def __repr__(self):
         return "Displaying first 100 records:\n" + self.to_data_frame(limit=100).__repr__()
@@ -92,8 +152,13 @@ def __init__(self, mem, element_type):
         assert rem == 0, "Malformed vector"
         self._size = size
 
+    def to_numpy(self):
+        """Convert the entire vector to a numpy structured array (vectorized)."""
+        return self._fields_to_numpy(self._as_numpy_2d())
+
     def to_data_frame(self):
-        return self[:].to_data_frame()
+        """Convert the entire vector to a pandas DataFrame built from to_numpy()."""
+        return pd.DataFrame(data=self.to_numpy())
 
     def __getitem__(self, index):
         if isinstance(index, slice):
@@ -106,11 +171,20 @@ def __getitem__(self, index):
         return self._get_item(index)
 
     def __iter__(self):
-        for i in range(len(self)):
-            yield self._get_item(i)
+        # Bind attributes to locals and construct the elements directly: this is
+        # the per-element path, so it skips the _get_item() indirection.
+        mem = self._mem
+        element_type = self._element_type
+        size_bytes = self._type_size_in_bytes
+        for i in range(self._size):
+            yield element_type(mem, SIZE_OFFSET_IN_BYTES + size_bytes * i)
 
     def __getattr__(self, name):
-        return pd.DataFrame(data=[[getattr(item, name)] for item in self], columns=[name])
+        # Dunder probes (copy, pickle, numpy) are never field names; rejecting them
+        # first also avoids infinite recursion on a not yet initialized instance.
+        if name.startswith('__'):
+            raise AttributeError(name)
+        return self._column_to_data_frame(name)
 
     def __len__(self):
         return self._size
diff --git a/flatdata-py/flatdata/lib/structure.py b/flatdata-py/flatdata/lib/structure.py
index fec73c02..4b19d900 100644
--- a/flatdata-py/flatdata/lib/structure.py
+++ b/flatdata-py/flatdata/lib/structure.py
@@ -9,6 +9,8 @@
 
 
 class Structure:
+    __slots__ = ('_mem', '_pos')
+
     def __init__(self, mem, pos):
         self._mem = mem
         self._pos = pos
diff --git a/flatdata-py/pyproject.toml b/flatdata-py/pyproject.toml
index f7adcda7..4e26d316 100644
--- a/flatdata-py/pyproject.toml
+++ b/flatdata-py/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "flatdata-py"
-version = "0.4.10"
+version = "0.4.11"
 description = "Python 3 implementation of Flatdata"
 readme =
"README.md" authors = [ @@ -16,7 +16,7 @@ classifiers = [ "Programming Language :: Python :: 3", ] dependencies = [ - "flatdata-generator==0.4.10", + "flatdata-generator==0.4.11", "numpy", "pandas", ] diff --git a/flatdata-py/tests/test_data_access.py b/flatdata-py/tests/test_data_access.py index 86af3c57..fa461858 100644 --- a/flatdata-py/tests/test_data_access.py +++ b/flatdata-py/tests/test_data_access.py @@ -1124,6 +1124,25 @@ def _test_reader(buffer, offset, num_bits, is_signed, expected): _test_reader(b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 3, 2, True, 0) +def test_reader_unaligned_64bit(): + """read_value must return at most 64 bits for a 64-bit field at a non-byte-aligned offset.""" + data = b"\xff" * 9 + # 64 bits starting at bit 1, all set → 0xFFFFFFFFFFFFFFFF + result = read_value(data, 1, 64, False) + assert result == 0xFFFFFFFFFFFFFFFF, \ + f"Expected 0xFFFFFFFFFFFFFFFF, got {result:#x} ({result.bit_length()} bits)" + + # Signed: all 1s → -1 + result_signed = read_value(data, 1, 64, True) + assert result_signed == -1 + + # Other non-byte-aligned offsets + for offset in range(1, 8): + result = read_value(data, offset, 64, False) + assert result == 0xFFFFFFFFFFFFFFFF, \ + f"offset={offset}: expected 0xFFFFFFFFFFFFFFFF, got {result:#x}" + + def test_writer(): """ Following tests were generated from C++ counterparts. 
Reasoning: python implementation lacks
diff --git a/flatdata-py/tests/test_vectorized_access.py b/flatdata-py/tests/test_vectorized_access.py
new file mode 100644
index 00000000..95eee071
--- /dev/null
+++ b/flatdata-py/tests/test_vectorized_access.py
@@ -0,0 +1,227 @@
+"""Tests for vectorized numpy access paths."""
+
+import numpy as np
+import pytest
+
+from flatdata.generator.engine import Engine
+from flatdata.lib.data_access import read_field_vectorized, read_value
+from common import (
+    DictResourceStorage,
+    ARCHIVE_SIGNATURE_PAYLOAD,
+    VECTOR_TEST_SCHEMA,
+    RESOURCE_VECTOR_PAYLOAD,
+)
+
+
+def _make_vector_archive():
+    """Create a test archive with a vector of SignedStructs."""
+    module = Engine(VECTOR_TEST_SCHEMA).render_python_module()
+    valid_data = {
+        "Archive.archive": ARCHIVE_SIGNATURE_PAYLOAD,
+        "Archive.archive.schema": module.backward_compatibility_Archive.schema().encode(),
+        "resource": RESOURCE_VECTOR_PAYLOAD,
+        "resource.schema": module.backward_compatibility_Archive.resource_schema('resource').encode()
+    }
+    archive = module.backward_compatibility_Archive(DictResourceStorage(valid_data))
+    return archive, module
+
+
+class TestReadFieldVectorized:
+    """Tests for the read_field_vectorized function."""
+
+    def test_all_fields_match_element_access(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        raw_2d = vector._as_numpy_2d()
+
+        for name, field in vector._element_type._FIELDS.items():
+            values = read_field_vectorized(
+                raw_2d, field.offset, field.width, field.is_signed
+            )
+            for i in range(len(vector)):
+                expected = getattr(vector[i], name)
+                actual = int(values[i])
+                assert expected == actual, \
+                    f"Mismatch in {name}[{i}]: expected={expected}, actual={actual}"
+
+    def test_signed_fields_read_correctly(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        raw_2d = vector._as_numpy_2d()
+
+        # Field 'a' is i16:5 (signed, 5 bits), expected value: -1
+        field_a = vector._element_type._FIELDS['a']
+        values_a = read_field_vectorized(raw_2d, field_a.offset, field_a.width, field_a.is_signed)
+        assert int(values_a[0]) == -1
+        assert int(values_a[1]) == -1
+
+        # Field 'c' is i32:7 (signed, 7 bits), expected value: -0x28 = -40
+        field_c = vector._element_type._FIELDS['c']
+        values_c = read_field_vectorized(raw_2d, field_c.offset, field_c.width, field_c.is_signed)
+        assert int(values_c[0]) == -0x28
+        assert int(values_c[1]) == -0x28
+
+
+class TestVectorToNumpy:
+    """Tests for vectorized Vector.to_numpy()."""
+
+    def test_to_numpy_matches_element_access(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        arr = vector.to_numpy()
+
+        assert len(arr) == len(vector)
+        for name in vector._element_type._FIELDS:
+            for i in range(len(vector)):
+                expected = getattr(vector[i], name)
+                actual = int(arr[name][i])
+                assert expected == actual
+
+    def test_to_numpy_dtype(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        arr = vector.to_numpy()
+        assert arr.dtype == np.dtype(vector._element_type.dtype())
+
+    def test_to_data_frame(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        df = vector.to_data_frame()
+        assert len(df) == len(vector)
+        assert list(df.columns) == list(vector._element_type._FIELDS.keys())
+
+
+class TestVectorSliceToNumpy:
+    """Tests for vectorized _VectorSlice.to_numpy()."""
+
+    def test_slice_to_numpy(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        s = vector[0:1]
+        arr = s.to_numpy()
+
+        assert len(arr) == 1
+        for name in vector._element_type._FIELDS:
+            expected = getattr(vector[0], name)
+            actual = int(arr[name][0])
+            assert expected == actual
+
+    def test_slice_to_data_frame(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        df = vector[0:2].to_data_frame()
+        assert len(df) == 2
+
+
+class TestVectorColumnAccess:
+    """Tests for vectorized Vector.__getattr__ column access."""
+
+    def test_column_access_returns_dataframe(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        df = vector.a
+        assert len(df) == len(vector)
+        assert 'a' in df.columns
+
+    def test_column_values_match(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        df = vector.b
+        for i in range(len(vector)):
+            expected = getattr(vector[i], 'b')
+            actual = int(df['b'].iloc[i])
+            assert expected == actual
+
+
+class TestNumpyCache:
+    """Tests for the _as_numpy_2d() cache."""
+
+    def test_cache_returns_same_object(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        arr1 = vector._as_numpy_2d()
+        arr2 = vector._as_numpy_2d()
+        assert arr1 is arr2
+
+    def test_shape(self):
+        archive, _ = _make_vector_archive()
+        vector = archive.resource
+        arr = vector._as_numpy_2d()
+        assert arr.shape == (len(vector), vector._element_type._SIZE_IN_BYTES)
+        assert arr.dtype == np.uint8
+
+
+class TestStructureSlots:
+    """Tests that Structure uses __slots__."""
+
+    def test_has_slots(self):
+        from flatdata.lib.structure import Structure
+        assert hasattr(Structure, '__slots__')
+        assert '_mem' in Structure.__slots__
+        assert '_pos' in Structure.__slots__
+
+
+class TestReadFieldVectorizedEdgeCases:
+    """Tests for boundary conditions in vectorized field reading."""
+
+    def test_1bit_unsigned(self):
+        raw = np.array([[0x01], [0x00], [0x03]], dtype=np.uint8)
+        result = read_field_vectorized(raw, 0, 1, False)
+        assert list(result) == [1, 0, 1]
+
+    def test_1bit_signed_matches_scalar(self):
+        """1-bit signed fields should return 0 or 1, matching read_value behavior."""
+        raw = np.array([[0x01], [0x00]], dtype=np.uint8)
+        result = read_field_vectorized(raw, 0, 1, True)
+        assert int(result[0]) == read_value(b'\x01', 0, 1, True)
+        assert int(result[1]) == read_value(b'\x00', 0, 1, True)
+
+    def test_64bit_unsigned(self):
+        raw = np.array([[0xFF] * 8], dtype=np.uint8)
+        result = read_field_vectorized(raw, 0, 64, False)
+        assert int(result[0]) == 0xFFFFFFFFFFFFFFFF
+
+    def test_64bit_signed_negative(self):
+        raw = np.array([[0xFF] * 8], dtype=np.uint8)
+        result = read_field_vectorized(raw, 0, 64, True)
+        assert int(result[0]) == -1
+
+    def test_64bit_signed_positive(self):
+        raw = np.array([[0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]], dtype=np.uint8)
+        result = read_field_vectorized(raw, 0, 64, True)
+        assert int(result[0]) == 1
+
+    def test_63bit_signed(self):
+        raw = np.array([[0xFF] * 8], dtype=np.uint8)
+        result = read_field_vectorized(raw, 0, 63, True)
+        assert int(result[0]) == -1
+
+    def test_unaligned_large_field(self):
+        """Fields where offset%8 + width > 64 require extra byte merge."""
+        raw = np.array([[0xFF] * 9], dtype=np.uint8)
+        # 64 bits starting at bit 1, all set → should be 0xFFFFFFFFFFFFFFFF
+        actual = int(read_field_vectorized(raw, 1, 64, False)[0])
+        assert actual == 0xFFFFFFFFFFFFFFFF
+
+    def test_empty_vector(self):
+        raw = np.zeros((0, 8), dtype=np.uint8)
+        result = read_field_vectorized(raw, 0, 32, False)
+        assert len(result) == 0
+
+
+class TestAttributeErrorContract:
+    """Vector/slice __getattr__ must raise AttributeError for unknown fields."""
+
+    def test_vector_unknown_field_raises_attribute_error(self):
+        archive, _ = _make_vector_archive()
+        with pytest.raises(AttributeError):
+            archive.resource.nonexistent_field
+
+    def test_vector_hasattr_returns_false(self):
+        archive, _ = _make_vector_archive()
+        assert not hasattr(archive.resource, "nonexistent_field")
+
+    def test_slice_unknown_field_raises_attribute_error(self):
+        archive, _ = _make_vector_archive()
+        with pytest.raises(AttributeError):
+            archive.resource[0:1].nonexistent_field