Source code for helix_ir.sources.json_source
"""JSON file source."""
from __future__ import annotations
import json
import os
from typing import Any, Iterable
[docs]
class JSONSource:
    """Read documents from a JSON or NDJSON file.

    A JSON file may hold either an array (each element is yielded in order)
    or a single object (yielded as the only document). An NDJSON file holds
    one JSON value per line; blank lines are ignored.
    """

    def __init__(self, path: str, format: str = "auto") -> None:
        """Store the file location and the requested format.

        The file is not opened here; all I/O happens in :meth:`read`.

        Args:
            path: Path to a JSON file (array of objects) or NDJSON file.
            format: 'json', 'ndjson', or 'auto' (detect by extension).
        """
        self.path = path
        self._format = format

    def _detect_format(self) -> str:
        """Guess the file format from the extension of ``self.path``.

        Returns:
            ``"ndjson"`` when the extension is ``.ndjson`` or ``.jsonl``
            (compared case-insensitively), ``"json"`` for anything else.
        """
        ext = os.path.splitext(self.path)[1].lower()
        if ext in (".ndjson", ".jsonl"):
            return "ndjson"
        return "json"

    def read(self) -> Iterable[dict[str, Any]]:
        """Lazily yield the documents stored in the file.

        This is a generator: the file is opened when iteration starts, not
        when ``read()`` is called. Any format value other than ``"ndjson"``
        (after resolving ``"auto"``) is read as a regular JSON file.

        Yields:
            For NDJSON, the parsed value of every non-blank line. For JSON,
            each element of a top-level array, or the top-level object
            itself. A top-level value of any other type yields nothing.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file (or an NDJSON line) is not
                valid JSON.
        """
        fmt = self._format if self._format != "auto" else self._detect_format()
        # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte
        # order mark. With plain "utf-8" the BOM reaches the parser as U+FEFF
        # and json rejects it ("Unexpected UTF-8 BOM"); str.strip() does not
        # remove it either, so the first NDJSON line would fail as well.
        with open(self.path, encoding="utf-8-sig") as f:
            if fmt == "ndjson":
                for line in f:
                    line = line.strip()
                    if line:  # tolerate blank separator / trailing lines
                        yield json.loads(line)
            else:
                data = json.load(f)
                if isinstance(data, list):
                    yield from data
                elif isinstance(data, dict):
                    yield data

    def schema_hint(self) -> dict[str, Any] | None:
        """Return a schema hint for this source.

        Returns:
            Always ``None``; this source provides no schema information.
        """
        return None