PyArrow FileSystem adapter
Use any Store as a pyarrow.fs.FileSystem for Parquet, CSV, and dataset I/O.
"""PyArrow FileSystem adapter — Use any Store as a `pyarrow.fs.FileSystem` for Parquet, CSV, and dataset I/O.
Requires: pip install "remote-store[arrow]"
Demonstrates:
- Creating a PyArrow filesystem from a Store
- Writing and reading Parquet files through the adapter
- Dataset discovery
---
see_also:
- label: PyArrow Adapter
url: ../../guides/pyarrow-adapter.md
note: PyArrow filesystem integration guide
"""
from __future__ import annotations
from typing import Any
try:
import pyarrow as pa
import pyarrow.parquet as pq
except ImportError as _exc:
print("This example requires PyArrow: pip install 'remote-store[arrow]'")
raise SystemExit(1) from _exc
from remote_store import Store
from remote_store.backends import MemoryBackend
from remote_store.ext.arrow import pyarrow_fs
def demo(store: Store) -> dict[str, Any]:
    """Run a Parquet round-trip and a dataset scan through the PyArrow adapter.

    Wraps *store* with ``pyarrow_fs``, writes and re-reads ``people.parquet``,
    looks up its file info, then writes three part files under ``dataset/``
    and reads them back as one dataset. Progress is printed along the way.

    Returns a dict with the keys ``type_name``, ``people_rows``,
    ``people_data``, ``file_size``, ``dataset_rows`` and ``dataset_files``.
    """
    import pyarrow.dataset as ds

    people_path = "people.parquet"
    arrow_fs = pyarrow_fs(store)
    summary: dict[str, Any] = {"type_name": arrow_fs.type_name}
    print(f"Filesystem type: {arrow_fs.type_name}")

    # Single-file round-trip: write a small table, then read it straight back.
    people = pa.table({"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]})
    pq.write_table(people, people_path, filesystem=arrow_fs)
    print("Wrote people.parquet")

    loaded = pq.read_table(people_path, filesystem=arrow_fs)
    summary["people_rows"] = loaded.num_rows
    summary["people_data"] = loaded.to_pydict()
    print(f"Read back {loaded.num_rows} rows:")
    print(loaded.to_pydict())

    # Metadata lookup for the file that was just written.
    file_info = arrow_fs.get_file_info(people_path)
    summary["file_size"] = file_info.size
    print(f"\nFile info: type={file_info.type}, size={file_info.size} bytes")

    # Three part files of five consecutive integers each: 0-4, 10-14, 20-24.
    for index in range(3):
        start = index * 10
        chunk = pa.table({"value": list(range(start, start + 5))})
        pq.write_table(chunk, f"dataset/part{index}.parquet", filesystem=arrow_fs)
    print("\nWrote 3 partitions to dataset/")

    # Open the directory as one dataset and load every part file.
    parquet_dataset = ds.dataset("dataset", filesystem=arrow_fs, format="parquet")
    combined = parquet_dataset.to_table()
    file_count = len(parquet_dataset.files)
    summary["dataset_rows"] = combined.num_rows
    summary["dataset_files"] = file_count
    print(f"Dataset: {combined.num_rows} total rows from {file_count} files")

    return summary
def main() -> None:
    """Run the demo against an in-memory store and always close the store.

    Builds a ``Store`` on a ``MemoryBackend``, passes it to ``demo`` and
    prints a final confirmation once the demo has completed.
    """
    backend = MemoryBackend()
    store = Store(backend=backend)
    try:
        demo(store)
    finally:
        # Close the store even when the demo raises, so a failing run
        # does not leave it open.
        store.close()
    print("\nDone!")


if __name__ == "__main__":
    main()
See also
- PyArrow Adapter — PyArrow filesystem integration guide
- Source: examples/integrations/pyarrow_adapter.py