Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions .github/workflows/py.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ jobs:
- name: Run tests
run: |
cd flatdata-py
uv run --with pytest --with ../flatdata-generator pytest -v
pip install .
flatdata-inspector --help
uv venv
uv pip install ../flatdata-generator
uv pip install ".[inspector]" pytest
.venv/bin/pytest -v
.venv/bin/flatdata-inspector --help

Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import flatdata.lib as flatdata
{{ struct.doc|to_python_doc}}
class {{ tree.namespace_path(struct, "_") }}_{{ struct.name }}(flatdata.structure.Structure):
"""{{ struct.doc|safe_py_string_line }}"""
__slots__ = ()
_SCHEMA = """{{ tree.schema(struct) }}"""
_NAME = "{{ tree.namespace_path(struct, "_") }}_{{ struct.name }}"
_SIZE_IN_BITS = {{ struct.size_in_bits }}
Expand Down
2 changes: 1 addition & 1 deletion flatdata-generator/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "flatdata-generator"
version = "0.4.10"
version = "0.4.11"
description = "Generate source code for C++, Rust, Go or Python from a Flatdata schema file"
readme = "README.md"
authors = [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
class n_S(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct S
{
Expand All @@ -20,6 +21,7 @@ class n_S(flatdata.structure.Structure):

class n_T(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct T
{
Expand All @@ -40,6 +42,7 @@ class n_T(flatdata.structure.Structure):
# Builtin type to for MultiVector index
class n__builtin_multivector_IndexType8(flatdata.structure.Structure):
"""/** Builtin type to for MultiVector index */"""
__slots__ = ()
_SCHEMA = """"""
_NAME = "n__builtin_multivector_IndexType8"
_SIZE_IN_BITS = 8
Expand All @@ -53,6 +56,7 @@ class n__builtin_multivector_IndexType8(flatdata.structure.Structure):
# Builtin type to for MultiVector index
class n__builtin_multivector_IndexType16(flatdata.structure.Structure):
"""/** Builtin type to for MultiVector index */"""
__slots__ = ()
_SCHEMA = """"""
_NAME = "n__builtin_multivector_IndexType16"
_SIZE_IN_BITS = 16
Expand All @@ -66,6 +70,7 @@ class n__builtin_multivector_IndexType16(flatdata.structure.Structure):
# Builtin type to for MultiVector index
class n__builtin_multivector_IndexType64(flatdata.structure.Structure):
"""/** Builtin type to for MultiVector index */"""
__slots__ = ()
_SCHEMA = """"""
_NAME = "n__builtin_multivector_IndexType64"
_SIZE_IN_BITS = 64
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
class n_S(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct S
{
Expand Down Expand Up @@ -92,6 +93,7 @@ def __init__(self, resource_storage):

class m_S(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace m {
struct S
{
Expand Down Expand Up @@ -184,6 +186,7 @@ def __init__(self, resource_storage):
# Builtin type to for MultiVector index
class a__builtin_multivector_IndexType32(flatdata.structure.Structure):
"""/** Builtin type to for MultiVector index */"""
__slots__ = ()
_SCHEMA = """"""
_NAME = "a__builtin_multivector_IndexType32"
_SIZE_IN_BITS = 32
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
class n_S(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct S
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
class n_S(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct S
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
class n_S(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct S
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# This is a comment about Foo
class n_Foo(flatdata.structure.Structure):
"""// This is a comment about Foo"""
__slots__ = ()
_SCHEMA = """namespace n {
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# This is a comment about Foo
class n_Foo(flatdata.structure.Structure):
"""// This is a comment about Foo"""
"""// This is a comment about Foo"""
__slots__ = ()
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

class n_U8(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct U8
{
Expand All @@ -32,6 +33,7 @@ class n_U8(flatdata.structure.Structure):

class n_I8(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct I8
{
Expand All @@ -52,6 +54,7 @@ class n_I8(flatdata.structure.Structure):

class n_U16(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct U16
{
Expand All @@ -72,6 +75,7 @@ class n_U16(flatdata.structure.Structure):

class n_I16(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct I16
{
Expand All @@ -92,6 +96,7 @@ class n_I16(flatdata.structure.Structure):

class n_U32(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct U32
{
Expand All @@ -112,6 +117,7 @@ class n_U32(flatdata.structure.Structure):

class n_I32(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct I32
{
Expand All @@ -132,6 +138,7 @@ class n_I32(flatdata.structure.Structure):

class n_U64(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct U64
{
Expand All @@ -152,6 +159,7 @@ class n_U64(flatdata.structure.Structure):

class n_I64(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct I64
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
class n_Foo(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct Foo
{
Expand All @@ -20,6 +21,7 @@ class n_Foo(flatdata.structure.Structure):

class m_Foo(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace m {
struct Foo
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
class n_U8(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct U8
{
Expand All @@ -23,6 +24,7 @@ class n_U8(flatdata.structure.Structure):

class n_I8(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct I8
{
Expand All @@ -46,6 +48,7 @@ class n_I8(flatdata.structure.Structure):

class n_U16(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct U16
{
Expand All @@ -69,6 +72,7 @@ class n_U16(flatdata.structure.Structure):

class n_I16(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct I16
{
Expand All @@ -92,6 +96,7 @@ class n_I16(flatdata.structure.Structure):

class n_U32(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct U32
{
Expand All @@ -115,6 +120,7 @@ class n_U32(flatdata.structure.Structure):

class n_I32(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct I32
{
Expand All @@ -138,6 +144,7 @@ class n_I32(flatdata.structure.Structure):

class n_U64(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct U64
{
Expand All @@ -161,6 +168,7 @@ class n_U64(flatdata.structure.Structure):

class n_I64(flatdata.structure.Structure):
""""""
__slots__ = ()
_SCHEMA = """namespace n {
struct I64
{
Expand Down
31 changes: 31 additions & 0 deletions flatdata-py/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,37 @@ Once you have [created a flatdata schema file](../README.md#creating-a-schema),
flatdata-generator --gen py --schema locations.flatdata --output-file locations.py
```

## Performance tips

`flatdata-py` supports two data access patterns with very different performance characteristics on large archives.

Iterating over a vector yields one Python object per element. Each field access unpacks bits from the underlying memory-mapped data. This is fine for accessing individual elements or small ranges, but has significant per-element overhead for bulk operations:

```python
count = sum(1 for x in archive.links if x.speed_limit > 100)
```

For bulk operations, use the vectorized access methods that read fields directly into NumPy arrays:

```python
# single column access, returns a pandas DataFrame
df = archive.links.speed_limit
count = len(df[df['speed_limit'] > 100])

# full NumPy structured array with all fields
arr = archive.links.to_numpy()
count = int(np.sum(arr['speed_limit'] > 100))

# slices work too
arr = archive.links[1000:2000].to_numpy()
df = archive.links[::10].to_data_frame()
```

* Use `vector.field_name` (column access) when you only need one or a few fields.
* Use `vector.to_numpy()` or `vector.to_data_frame()` when you need all fields at once.
* Use `vector[i].field` for random access to individual elements.
* The underlying data is memory-mapped; the OS pages it from disk on demand. Vectorized results are materialized as NumPy arrays in RAM.

## Using the inspector

`flatdata-py` comes with a handy tool called the `flatdata-inspector` to inspect the contents of an archive:
Expand Down
4 changes: 2 additions & 2 deletions flatdata-py/flatdata/lib/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ def __init__(self, resource_storage):
self.__getattr__(name)

def __getattr__(self, name):
    """Return the archive resource called ``name``, opening it on first use.

    :param name: attribute name being looked up; must be a key of
        ``self._RESOURCES``
    :return: the resource object stored in ``self._loaded_resources``
    :raises AttributeError: if ``name`` is not a key of ``self._RESOURCES``
    """
    # Test membership on the dicts directly; building ``list(d.keys())``
    # first turns every attribute access into a linear scan.
    if name not in self._RESOURCES:
        raise AttributeError("Resource %s not defined in archive." % name)
    if name not in self._loaded_resources:
        # Open on first access and keep the result so later lookups reuse it.
        self._loaded_resources[name] = self._open_resource(name)
    return self._loaded_resources[name]

Expand Down
50 changes: 49 additions & 1 deletion flatdata-py/flatdata/lib/data_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
See the LICENSE file in the root of this project for license details.
'''

import numpy as np

# Sign bits cache for the value reading.
_SIGN_BITS = [0] + [(1 << (bits - 1)) for bits in range(1, 65)]

Expand All @@ -20,7 +22,7 @@ def read_value(data, offset_bits, num_bits, is_signed):
remainder = data[offset_bytes + total_bytes]
result |= remainder << (total_bytes * 8 - offset_extra_bits)

if num_bits < 64:
if num_bits < 64 or offset_extra_bits > 0:
result = result & ((1 << num_bits) - 1)

if not is_signed:
Expand Down Expand Up @@ -62,3 +64,49 @@ def write_value(data, offset_bits, num_bits, is_signed, value):
surrounding_bits = data[offset_bytes + byte_idx] & ~((1 << offset_bits) - 1)
data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx] & ((1 << (8 - (bits_written % 8))) - 1)
data[offset_bytes + byte_idx] |= surrounding_bits


def read_field_vectorized(raw_bytes_2d, field_offset_bits, field_width_bits, is_signed):
    """Read a bit-packed field from all elements at once, returning a numpy array.

    :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
    :param field_offset_bits: bit offset of the field within each element
    :param field_width_bits: width of the field in bits (max 64)
    :param is_signed: whether to sign-extend the result
    :return: numpy array of field values; dtype is ``uint64`` for unsigned
        fields and ``int64`` for signed fields
    """
    byte_start = field_offset_bits // 8
    bit_shift = field_offset_bits % 8

    if field_width_bits == 1:
        # Fast path: a single bit always lives in exactly one byte.
        bits = ((raw_bytes_2d[:, byte_start].astype(np.uint64) >> np.uint64(bit_shift)) &
                np.uint64(1))
        if is_signed:
            # In 1-bit two's complement the only bit is the sign bit, so a
            # set bit means -1. This matches what the general path below
            # computes for the same field.
            return -bits.astype(np.int64)
        return bits

    bytes_needed = (bit_shift + field_width_bits + 7) // 8
    low_bytes = min(bytes_needed, 8)

    # Assemble up to 8 little-endian bytes per element into a uint64 column.
    # All arithmetic stays in uint64, so bits shifted beyond bit 63 are
    # simply dropped.
    result = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
    for b in range(low_bytes):
        result |= raw_bytes_2d[:, byte_start + b].astype(np.uint64) << np.uint64(b * 8)
    result >>= np.uint64(bit_shift)

    # A field that starts mid-byte and is wide enough can touch a 9th byte
    # (bit_shift + width > 64). After the shift above the low 8 bytes
    # contribute (64 - bit_shift) bits; the remaining high bits come from
    # that extra byte.
    if bytes_needed > 8:
        bits_so_far = 8 * low_bytes - bit_shift
        extra = raw_bytes_2d[:, byte_start + 8].astype(np.uint64)
        result |= extra << np.uint64(bits_so_far)

    # Drop bits belonging to neighbouring fields. A 64-bit field needs no
    # mask (and the mask would not fit the shift arithmetic anyway).
    if field_width_bits < 64:
        result &= np.uint64((1 << field_width_bits) - 1)

    if is_signed:
        if field_width_bits == 64:
            # Reinterpreting the bits as int64 is already two's complement.
            return result.view(np.int64)
        sign_bit = np.uint64(1 << (field_width_bits - 1))
        # Values with the sign bit set represent value - 2**width.
        offset = -(1 << field_width_bits)
        signed = result.astype(np.int64) + np.int64(offset)
        result = np.where(result & sign_bit, signed, result.astype(np.int64))

    return result
Loading
Loading