Skip to content

canvod.store_metadata API Reference

Rich provenance metadata for Icechunk stores, aligned with DataCite 4.5, ACDD 1.3, and STAC 1.1.

Package

canvod-store-metadata — Store-level provenance for Icechunk stores.

This package manages store metadata: identity, creator, environment, software provenance, and standards compliance (DataCite/ACDD/STAC).

Not to be confused with the file registry in canvod.store ({group}/metadata/table), which tracks individual ingested files.

StoreMetadata

Bases: BaseModel

Root metadata model composing all 11 sections.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/schema.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
class StoreMetadata(BaseModel):
    """Root metadata model composing all 11 sections.

    Persisted as a single JSON blob in the Zarr root attrs under
    ``_METADATA_KEY`` (see ``to_root_attrs`` / ``from_root_attrs``).
    """

    # Schema version stamp so readers can detect format changes.
    metadata_version: str = Field(_METADATA_VERSION, description="Schema version")
    identity: StoreIdentity  # required
    creator: Creator  # required
    publisher: Publisher = Field(default_factory=Publisher)
    temporal: TemporalExtent  # required
    spatial: SpatialExtent  # required
    # Sections below fall back to an empty sub-model when not supplied.
    instruments: Instruments = Field(default_factory=Instruments)
    processing: ProcessingProvenance = Field(default_factory=ProcessingProvenance)
    environment: Environment = Field(default_factory=Environment)
    config: ConfigSnapshot = Field(default_factory=ConfigSnapshot)
    references: References = Field(default_factory=References)
    summaries: Summaries = Field(default_factory=Summaries)

    def to_root_attrs(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict for Zarr root attrs."""
        # Single top-level key keeps the store's attrs namespace clean.
        return {_METADATA_KEY: self.model_dump(mode="json")}

    @classmethod
    def from_root_attrs(cls, attrs: dict[str, Any]) -> StoreMetadata:
        """Reconstruct from Zarr root attrs dict.

        Raises KeyError if ``_METADATA_KEY`` is absent from *attrs*.
        """
        data = attrs[_METADATA_KEY]
        return cls.model_validate(data)

to_root_attrs()

Serialize to a JSON-compatible dict for Zarr root attrs.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/schema.py
223
224
225
def to_root_attrs(self) -> dict[str, Any]:
    """Serialize to a JSON-compatible dict for Zarr root attrs.

    Returns a single-key mapping ``{_METADATA_KEY: <model as JSON dict>}``
    so the store's attrs namespace is not polluted by individual fields.
    """
    return {_METADATA_KEY: self.model_dump(mode="json")}

from_root_attrs(attrs) classmethod

Reconstruct from Zarr root attrs dict.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/schema.py
227
228
229
230
231
@classmethod
def from_root_attrs(cls, attrs: dict[str, Any]) -> StoreMetadata:
    """Reconstruct from Zarr root attrs dict.

    Raises KeyError if ``_METADATA_KEY`` is absent from *attrs*; raises
    pydantic's ValidationError if the stored payload no longer matches
    the schema.
    """
    data = attrs[_METADATA_KEY]
    return cls.model_validate(data)

collect_metadata(*, config, site_name, site_config, store_type, source_format, store_path=None, dask_workers=None, dask_threads_per_worker=None)

Collect all metadata sections into a StoreMetadata object.

This is the main entry point for metadata collection.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
def collect_metadata(
    *,
    config: Any,
    site_name: str,
    site_config: Any,
    store_type: str,
    source_format: str,
    store_path: Path | None = None,
    dask_workers: int | None = None,
    dask_threads_per_worker: int | None = None,
) -> StoreMetadata:
    """Collect all metadata sections into a StoreMetadata object.

    This is the main entry point for metadata collection.
    """
    timestamp = datetime.now(UTC).isoformat()
    metadata_cfg = config.processing.metadata

    # Human-readable title: "my_site Rinex Store" from "rinex_store".
    pretty_type = store_type.replace('_', ' ').title()
    identity = StoreIdentity(
        id=f"{site_name}/{store_type}",
        title=f"{site_name} {pretty_type}",
        store_type=store_type,
        source_format=source_format,
        keywords=["GNSS", "VOD", site_name, source_format],
        naming_authority=getattr(metadata_cfg, "naming_authority", None),
    )
    environment = collect_environment(
        store_path, dask_workers, dask_threads_per_worker
    )
    summaries = Summaries(
        history=[f"{timestamp}: Store created ({source_format})"],
    )

    return StoreMetadata(
        identity=identity,
        creator=collect_creator(metadata_cfg),
        publisher=collect_publisher(metadata_cfg),
        temporal=TemporalExtent(created=timestamp, updated=timestamp),
        spatial=collect_spatial(site_config, site_name),
        instruments=collect_instruments(site_config),
        processing=collect_processing_provenance(store_type, source_format),
        environment=environment,
        config=collect_config_snapshot(config),
        references=collect_references(config),
        summaries=summaries,
    )

write_metadata(store_path, metadata, branch='main')

Write metadata to Icechunk store root attrs.

Returns

str Snapshot ID from the commit.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/io.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def write_metadata(
    store_path: Path,
    metadata: StoreMetadata,
    branch: str = "main",
) -> str:
    """Write metadata to Icechunk store root attrs.

    Returns
    -------
    str
        Snapshot ID from the commit.
    """
    session = _open_repo(store_path).writable_session(branch)
    # Reuse the existing root group when present; create it otherwise.
    try:
        root = zarr.open_group(session.store, mode="r+")
    except zarr_errors.GroupNotFoundError:
        root = zarr.open_group(session.store, mode="w")
    root.attrs.update(metadata.to_root_attrs())
    return session.commit("Write store metadata")

read_metadata(store_path, branch='main')

Read metadata from Icechunk store root attrs.

Raises

KeyError If no metadata found in store.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/io.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def read_metadata(
    store_path: Path,
    branch: str = "main",
) -> StoreMetadata:
    """Read metadata from Icechunk store root attrs.

    Raises
    ------
    KeyError
        If no metadata found in store.
    """
    session = _open_repo(store_path).readonly_session(branch=branch)
    root = zarr.open_group(session.store, mode="r")
    # from_root_attrs raises KeyError when the metadata key is absent.
    return StoreMetadata.from_root_attrs(dict(root.attrs))

update_metadata(store_path, updates, branch='main')

Merge updates into existing metadata.

Supports dotted keys for nested updates (e.g. "temporal.updated").

Returns

str Snapshot ID from the commit.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/io.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def update_metadata(
    store_path: Path,
    updates: dict[str, Any],
    branch: str = "main",
) -> str:
    """Merge updates into existing metadata.

    Supports dotted keys for nested updates (e.g. "temporal.updated").

    Returns
    -------
    str
        Snapshot ID from the commit.
    """
    data = read_metadata(store_path, branch).model_dump(mode="json")

    # Apply each dotted-path update by walking down to the parent dict.
    for dotted_key, new_value in updates.items():
        *parents, leaf = dotted_key.split(".")
        node = data
        for name in parents:
            node = node[name]
        node[leaf] = new_value

    # Re-validate before persisting so a bad update never reaches the store.
    merged = StoreMetadata.model_validate(data)

    session = _open_repo(store_path).writable_session(branch)
    try:
        root = zarr.open_group(session.store, mode="r+")
    except zarr_errors.GroupNotFoundError:
        root = zarr.open_group(session.store, mode="w")
    root.attrs.update(merged.to_root_attrs())
    return session.commit("Update store metadata")

metadata_exists(store_path, branch='main')

Check if metadata exists in the store.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/io.py
102
103
104
105
106
107
108
109
110
111
def metadata_exists(store_path: Path, branch: str = "main") -> bool:
    """Check if metadata exists in the store."""
    # Best-effort probe: any failure (missing repo, branch, or root group)
    # is reported as "no metadata" rather than raised to the caller.
    try:
        session = _open_repo(store_path).readonly_session(branch=branch)
        root_attrs = zarr.open_group(session.store, mode="r").attrs
        return _METADATA_KEY in root_attrs
    except Exception:
        return False

validate_all(metadata)

Run all validators, return issues grouped by standard.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/validate.py
194
195
196
197
198
199
200
201
def validate_all(metadata: StoreMetadata) -> dict[str, list[str]]:
    """Run all validators, return issues grouped by standard."""
    validators = {
        "fair": validate_fair,
        "datacite": validate_datacite,
        "acdd": validate_acdd,
        "stac": validate_stac,
    }
    return {standard: check(metadata) for standard, check in validators.items()}

scan_stores(root_dir, recursive=True)

Walk directories, find Icechunk stores, build a catalog.

Returns

pl.DataFrame One row per store with metadata columns.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/inventory.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def scan_stores(root_dir: Path, recursive: bool = True) -> pl.DataFrame:
    """Walk directories, find Icechunk stores, build a catalog.

    Returns
    -------
    pl.DataFrame
        One row per store with metadata columns.
    """

    def _row_for(path: Path) -> dict[str, Any]:
        # Fall back to an empty row when metadata is absent or unreadable.
        if metadata_exists(path):
            try:
                return _metadata_to_row(path, read_metadata(path))
            except Exception:
                pass
        return _empty_row(path)

    rows = [_row_for(sp) for sp in _find_icechunk_stores(root_dir, recursive)]

    # An explicit schema keeps the empty catalog well-typed.
    if not rows:
        return pl.DataFrame(schema=_SCHEMA)
    return pl.DataFrame(rows)

Schema

Pydantic models for store metadata (11 sections, ~90 fields).

Aligns with FAIR data principles, DataCite 4.5, ACDD 1.3, STAC 1.1, and W3C PROV.

StoreMetadata

Bases: BaseModel

Root metadata model composing all 11 sections.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/schema.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
class StoreMetadata(BaseModel):
    """Root metadata model composing all 11 sections."""

    metadata_version: str = Field(_METADATA_VERSION, description="Schema version")
    identity: StoreIdentity
    creator: Creator
    publisher: Publisher = Field(default_factory=Publisher)
    temporal: TemporalExtent
    spatial: SpatialExtent
    instruments: Instruments = Field(default_factory=Instruments)
    processing: ProcessingProvenance = Field(default_factory=ProcessingProvenance)
    environment: Environment = Field(default_factory=Environment)
    config: ConfigSnapshot = Field(default_factory=ConfigSnapshot)
    references: References = Field(default_factory=References)
    summaries: Summaries = Field(default_factory=Summaries)

    def to_root_attrs(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict for Zarr root attrs."""
        return {_METADATA_KEY: self.model_dump(mode="json")}

    @classmethod
    def from_root_attrs(cls, attrs: dict[str, Any]) -> StoreMetadata:
        """Reconstruct from Zarr root attrs dict."""
        data = attrs[_METADATA_KEY]
        return cls.model_validate(data)

to_root_attrs()

Serialize to a JSON-compatible dict for Zarr root attrs.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/schema.py
223
224
225
def to_root_attrs(self) -> dict[str, Any]:
    """Serialize to a JSON-compatible dict for Zarr root attrs."""
    return {_METADATA_KEY: self.model_dump(mode="json")}

from_root_attrs(attrs) classmethod

Reconstruct from Zarr root attrs dict.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/schema.py
227
228
229
230
231
@classmethod
def from_root_attrs(cls, attrs: dict[str, Any]) -> StoreMetadata:
    """Reconstruct from Zarr root attrs dict."""
    data = attrs[_METADATA_KEY]
    return cls.model_validate(data)

Collectors

Runtime collectors — pure functions that gather metadata.

collect_software_versions()

Collect versions of key packages via importlib.metadata.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def collect_software_versions() -> dict[str, str]:
    """Collect versions of key packages via importlib.metadata."""
    import importlib.metadata

    # Packages whose versions matter for reproducing a store.
    tracked = (
        "canvodpy",
        "canvod-readers",
        "canvod-store",
        "canvod-store-metadata",
        "canvod-utils",
        "icechunk",
        "zarr",
        "xarray",
        "dask",
        "numpy",
        "polars",
        "pydantic",
    )
    found: dict[str, str] = {}
    for name in tracked:
        try:
            found[name] = importlib.metadata.version(name)
        except importlib.metadata.PackageNotFoundError:
            # Not installed in this environment — simply omit it.
            continue
    return found

collect_python_info()

Return Python version + implementation.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
65
66
67
def collect_python_info() -> str:
    """Return Python version + implementation."""
    implementation = platform.python_implementation()
    return f"{sys.version} ({implementation})"

collect_uv_version()

Return uv version or None if not installed.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def collect_uv_version() -> str | None:
    """Return uv version or None if not installed."""
    try:
        result = subprocess.run(
            ["uv", "--version"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if result.returncode == 0:
            return result.stdout.strip()
    except FileNotFoundError, subprocess.TimeoutExpired:
        pass
    return None

collect_environment(store_path=None, dask_workers=None, dask_threads_per_worker=None)

Collect runtime environment information.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def collect_environment(
    store_path: Path | None = None,
    dask_workers: int | None = None,
    dask_threads_per_worker: int | None = None,
) -> Environment:
    """Collect runtime environment information."""

    def _total_memory_gb() -> float | None:
        # psutil is an optional dependency; skip the reading when absent.
        try:
            import psutil
        except ImportError:
            return None
        return round(psutil.virtual_memory().total / (1024**3), 1)

    def _free_disk_gb() -> float | None:
        if store_path is None:
            return None
        # The store may not exist yet — fall back to its parent directory.
        probe = store_path if store_path.exists() else store_path.parent
        if not probe.exists():
            return None
        return round(shutil.disk_usage(probe).free / (1024**3), 1)

    # Raw pyproject.toml + uv.lock snapshots from the monorepo root, if found.
    pyproject_text = None
    uv_lock_text = None
    uv_lock_hash = None
    monorepo = _find_monorepo_root()
    if monorepo is not None:
        pyproject_text = _read_file_text(monorepo / "pyproject.toml")
        uv_lock_text = _read_file_text(monorepo / "uv.lock")
        uv_lock_hash = _read_uv_lock_hash(monorepo)

    return Environment(
        hostname=socket.gethostname(),
        os=platform.platform(),
        arch=platform.machine(),
        cpu_count=os.cpu_count(),
        memory_gb=_total_memory_gb(),
        disk_free_gb=_free_disk_gb(),
        dask_workers=dask_workers,
        dask_threads_per_worker=dask_threads_per_worker,
        uv_lock_hash=uv_lock_hash,
        pyproject_toml_text=pyproject_text,
        uv_lock_text=uv_lock_text,
    )

collect_config_snapshot(config)

Serialize config sections + compute SHA256 hash.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
def collect_config_snapshot(config: Any) -> ConfigSnapshot:
    """Serialize config sections + compute SHA256 hash."""

    def _lookup(name: str) -> Any:
        # Top-level lookup first (dict key or attribute), then fall back
        # to config.processing.<name> for nested layouts.
        if isinstance(config, dict):
            found = config.get(name)
        else:
            found = getattr(config, name, None)
        if found is None:
            processing = getattr(config, "processing", None)
            if processing is not None:
                found = getattr(processing, name, None)
        return found

    def _as_jsonable(obj: Any) -> Any:
        # Pydantic models dump to JSON dicts; dicts pass through;
        # anything else is stringified.
        if hasattr(obj, "model_dump"):
            return obj.model_dump(mode="json")
        if isinstance(obj, dict):
            return obj
        return str(obj)

    section_names = (
        "processing",
        "preprocessing",
        "aux_data",
        "compression",
        "icechunk",
        "sids",
    )
    sections: dict[str, Any] = {}
    for name in section_names:
        value = _lookup(name)
        if value is not None:
            sections[name] = _as_jsonable(value)

    # sort_keys makes the serialization deterministic, so identical configs
    # always hash identically.
    serialized = json.dumps(sections, sort_keys=True, default=str)
    digest = hashlib.sha256(serialized.encode()).hexdigest()

    return ConfigSnapshot(
        processing=sections.get("processing"),
        preprocessing=sections.get("preprocessing"),
        aux_data=sections.get("aux_data"),
        compression=sections.get("compression"),
        icechunk=sections.get("icechunk"),
        sids=sections.get("sids"),
        config_hash=digest,
    )

collect_creator(metadata_config)

Build Creator from MetadataConfig.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
208
209
210
211
212
213
214
215
216
217
218
219
def collect_creator(metadata_config: Any) -> Creator:
    """Build Creator from MetadataConfig."""

    def optional(name: str) -> Any:
        # Optional fields default to None when the config omits them.
        return getattr(metadata_config, name, None)

    return Creator(
        name=metadata_config.author,
        email=str(metadata_config.email),
        orcid=optional("orcid"),
        institution=metadata_config.institution,
        institution_ror=optional("institution_ror"),
        department=optional("department"),
        research_group=optional("research_group"),
        website=optional("website"),
    )

collect_publisher(metadata_config)

Build Publisher from MetadataConfig.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
222
223
224
225
226
227
228
def collect_publisher(metadata_config: Any) -> Publisher:
    """Build Publisher from MetadataConfig."""
    # All publisher fields are optional in the config model.
    fields = {
        "name": getattr(metadata_config, "publisher", None),
        "url": getattr(metadata_config, "publisher_url", None),
        "license": getattr(metadata_config, "license", None),
    }
    return Publisher(**fields)

collect_spatial(site_config, site_name)

Build SpatialExtent from SiteConfig.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
def collect_spatial(site_config: Any, site_name: str) -> SpatialExtent:
    """Build SpatialExtent from SiteConfig."""
    latitude = getattr(site_config, "latitude", None)
    longitude = getattr(site_config, "longitude", None)
    altitude = getattr(site_config, "altitude_m", None)

    # A point site collapses to a degenerate [lon, lat, lon, lat] bbox.
    has_point = latitude is not None and longitude is not None
    bbox = [longitude, latitude, longitude, latitude] if has_point else None

    site = SiteInfo(
        name=site_name,
        description=getattr(site_config, "description", None),
        country=getattr(site_config, "country", None),
    )
    return SpatialExtent(
        site=site,
        geospatial_lat=latitude,
        geospatial_lon=longitude,
        geospatial_alt_m=altitude,
        geospatial_lat_min=latitude,
        geospatial_lat_max=latitude,
        geospatial_lon_min=longitude,
        geospatial_lon_max=longitude,
        bbox=bbox,
    )

collect_instruments(site_config)

Build Instruments from SiteConfig receivers.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
258
259
260
261
262
263
264
265
266
267
268
269
270
def collect_instruments(site_config: Any) -> Instruments:
    """Build Instruments from SiteConfig receivers."""
    # One ReceiverInfo per configured receiver; optional attrs default.
    receivers = {
        name: ReceiverInfo(
            type=cfg.type,
            directory=cfg.directory,
            reader_format=getattr(cfg, "reader_format", "auto"),
            description=getattr(cfg, "description", None),
            recipe=getattr(cfg, "recipe", None),
            metadata=getattr(cfg, "metadata", None),
        )
        for name, cfg in site_config.receivers.items()
    }
    return Instruments(receivers=receivers)

collect_processing_provenance(store_type, source_format)

Build ProcessingProvenance.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
def collect_processing_provenance(
    store_type: str,
    source_format: str,
) -> ProcessingProvenance:
    """Build ProcessingProvenance."""
    # rinex_store is the raw-ingest level (L1); anything else is L2.
    level = "L1" if store_type == "rinex_store" else "L2"
    return ProcessingProvenance(
        software=collect_software_versions(),
        python=collect_python_info(),
        uv_version=collect_uv_version(),
        level=level,
        lineage=f"Raw {source_format} data ingested into Icechunk store",
        facility=socket.gethostname(),
        datetime=datetime.now(UTC).isoformat(),
    )

collect_references(config)

Build References from config if available.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
def collect_references(config: Any) -> References:
    """Build References from config if available."""
    from .schema import FundingRef, PublicationRef

    processing = getattr(config, "processing", None)
    refs_config = (
        getattr(processing, "references", None) if processing is not None else None
    )
    if refs_config is None:
        # No references section configured — return the empty model.
        return References()

    publications = []
    for pub in getattr(refs_config, "publications", []):
        publications.append(
            PublicationRef(doi=pub.doi, citation=getattr(pub, "citation", None))
        )

    funding = []
    for grant in getattr(refs_config, "funding", []):
        funding.append(
            FundingRef(
                funder=grant.funder,
                funder_ror=getattr(grant, "funder_ror", None),
                grant_number=getattr(grant, "grant_number", None),
                award_title=getattr(grant, "award_title", None),
            )
        )
    return References(publications=publications, funding=funding)

collect_metadata(*, config, site_name, site_config, store_type, source_format, store_path=None, dask_workers=None, dask_threads_per_worker=None)

Collect all metadata sections into a StoreMetadata object.

This is the main entry point for metadata collection.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/collectors.py
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
def collect_metadata(
    *,
    config: Any,
    site_name: str,
    site_config: Any,
    store_type: str,
    source_format: str,
    store_path: Path | None = None,
    dask_workers: int | None = None,
    dask_threads_per_worker: int | None = None,
) -> StoreMetadata:
    """Collect all metadata sections into a StoreMetadata object.

    This is the main entry point for metadata collection.
    """
    now = datetime.now(UTC).isoformat()
    meta_cfg = config.processing.metadata

    return StoreMetadata(
        identity=StoreIdentity(
            id=f"{site_name}/{store_type}",
            title=f"{site_name} {store_type.replace('_', ' ').title()}",
            store_type=store_type,
            source_format=source_format,
            keywords=["GNSS", "VOD", site_name, source_format],
            naming_authority=getattr(meta_cfg, "naming_authority", None),
        ),
        creator=collect_creator(meta_cfg),
        publisher=collect_publisher(meta_cfg),
        temporal=TemporalExtent(created=now, updated=now),
        spatial=collect_spatial(site_config, site_name),
        instruments=collect_instruments(site_config),
        processing=collect_processing_provenance(store_type, source_format),
        environment=collect_environment(
            store_path, dask_workers, dask_threads_per_worker
        ),
        config=collect_config_snapshot(config),
        references=collect_references(config),
        summaries=Summaries(
            history=[f"{now}: Store created ({source_format})"],
        ),
    )

Store I/O

Read/write metadata to/from Icechunk store root attrs.

write_metadata(store_path, metadata, branch='main')

Write metadata to Icechunk store root attrs.

Returns

str Snapshot ID from the commit.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/io.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def write_metadata(
    store_path: Path,
    metadata: StoreMetadata,
    branch: str = "main",
) -> str:
    """Write metadata to Icechunk store root attrs.

    Returns
    -------
    str
        Snapshot ID from the commit.
    """
    repo = _open_repo(store_path)
    session = repo.writable_session(branch)
    store = session.store
    try:
        root = zarr.open_group(store, mode="r+")
    except zarr_errors.GroupNotFoundError:
        root = zarr.open_group(store, mode="w")

    root.attrs.update(metadata.to_root_attrs())
    return session.commit("Write store metadata")

read_metadata(store_path, branch='main')

Read metadata from Icechunk store root attrs.

Raises

KeyError If no metadata found in store.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/io.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def read_metadata(
    store_path: Path,
    branch: str = "main",
) -> StoreMetadata:
    """Read metadata from Icechunk store root attrs.

    Raises
    ------
    KeyError
        If no metadata found in store.
    """
    repo = _open_repo(store_path)
    session = repo.readonly_session(branch=branch)
    store = session.store
    root = zarr.open_group(store, mode="r")
    attrs = dict(root.attrs)
    return StoreMetadata.from_root_attrs(attrs)

update_metadata(store_path, updates, branch='main')

Merge updates into existing metadata.

Supports dotted keys for nested updates (e.g. "temporal.updated").

Returns

str Snapshot ID from the commit.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/io.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def update_metadata(
    store_path: Path,
    updates: dict[str, Any],
    branch: str = "main",
) -> str:
    """Merge updates into existing metadata.

    Supports dotted keys for nested updates (e.g. "temporal.updated").

    Returns
    -------
    str
        Snapshot ID from the commit.
    """
    existing = read_metadata(store_path, branch)
    data = existing.model_dump(mode="json")

    for key, value in updates.items():
        parts = key.split(".")
        target = data
        for part in parts[:-1]:
            target = target[part]
        target[parts[-1]] = value

    updated = StoreMetadata.model_validate(data)

    repo = _open_repo(store_path)
    session = repo.writable_session(branch)
    store = session.store
    try:
        root = zarr.open_group(store, mode="r+")
    except zarr_errors.GroupNotFoundError:
        root = zarr.open_group(store, mode="w")

    root.attrs.update(updated.to_root_attrs())
    return session.commit("Update store metadata")

metadata_exists(store_path, branch='main')

Check if metadata exists in the store.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/io.py
102
103
104
105
106
107
108
109
110
111
def metadata_exists(store_path: Path, branch: str = "main") -> bool:
    """Check if metadata exists in the store."""
    try:
        repo = _open_repo(store_path)
        session = repo.readonly_session(branch=branch)
        store = session.store
        root = zarr.open_group(store, mode="r")
        return _METADATA_KEY in root.attrs
    except Exception:
        return False

Validation

Validate metadata completeness against FAIR, DataCite 4.5, ACDD 1.3, STAC 1.1.

validate_datacite(metadata)

Check DataCite 4.5 mandatory fields.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/validate.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def validate_datacite(metadata: StoreMetadata) -> list[str]:
    """Check DataCite 4.5 mandatory fields."""
    mandatory = (
        (metadata.identity, "id", "identity"),
        (metadata.identity, "title", "identity"),
        (metadata.creator, "name", "creator"),
        (metadata.creator, "institution", "creator"),
        (metadata.temporal, "created", "temporal"),
        (metadata.identity, "store_type", "identity"),
    )
    issues = [
        issue
        for obj, field, label in mandatory
        if (issue := _check_field(obj, field, label))
    ]
    # publisher.name has no _check_field rule; check it explicitly.
    if metadata.publisher.name is None:
        issues.append("publisher.name is missing (DataCite mandatory)")
    return issues

validate_acdd(metadata)

Check ACDD 1.3 highly recommended + recommended fields.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/validate.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def validate_acdd(metadata: StoreMetadata) -> list[str]:
    """Check ACDD 1.3 highly recommended + recommended fields."""
    highly_recommended = (
        (metadata.identity, "title", "identity"),
        (metadata.identity, "id", "identity"),
        (metadata.identity, "conventions", "identity"),
        (metadata.creator, "name", "creator"),
        (metadata.creator, "email", "creator"),
        (metadata.creator, "institution", "creator"),
        (metadata.temporal, "created", "temporal"),
    )
    recommended = (
        (metadata.identity, "keywords", "identity"),
        (metadata.identity, "description", "identity"),
        (metadata.spatial, "geospatial_lat", "spatial"),
        (metadata.spatial, "geospatial_lon", "spatial"),
    )

    issues: list[str] = []
    # Report highly-recommended issues first, each tagged by severity.
    for severity, checks in (
        ("highly recommended", highly_recommended),
        ("recommended", recommended),
    ):
        for obj, field, label in checks:
            if issue := _check_field(obj, field, label):
                issues.append(f"[{severity}] {issue}")
    return issues

validate_stac(metadata)

Check STAC 1.1 Collection required fields.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/validate.py
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def validate_stac(metadata: StoreMetadata) -> list[str]:
    """Check STAC 1.1 Collection required fields.

    Parameters
    ----------
    metadata : StoreMetadata
        Metadata to check for STAC Collection readiness.

    Returns
    -------
    list[str]
        One human-readable issue per missing field; empty when compliant.
    """
    issues: list[str] = []

    if not metadata.identity.id:
        issues.append("identity.id required for STAC Collection")
    if not metadata.identity.title:
        issues.append("identity.title required for STAC Collection")
    if not metadata.identity.description:
        # STAC 1.1 marks Collection.description as REQUIRED; the previous
        # message understated it as merely "recommended".
        issues.append("identity.description required for STAC Collection")
    if metadata.spatial.bbox is None:
        issues.append("spatial.bbox required for STAC spatial extent")
    if metadata.spatial.extent_temporal_interval is None:
        issues.append(
            "spatial.extent_temporal_interval required for STAC temporal extent"
        )
    if metadata.publisher.license is None:
        issues.append("publisher.license required for STAC Collection")

    return issues

validate_fair(metadata)

Check FAIR data principles compliance.

Checks each sub-principle (F1–F4, A1–A2, I1–I3, R1.1–R1.3) and returns actionable issues for anything missing.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/validate.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
def validate_fair(metadata: StoreMetadata) -> list[str]:
    """Check FAIR data principles compliance.

    Checks each sub-principle (F1–F4, A1–A2, I1–I3, R1.1–R1.3)
    and returns actionable issues for anything missing.
    """
    issues: list[str] = []

    def flag(condition: bool, message: str) -> None:
        # Record one issue whenever its trigger condition holds.
        if condition:
            issues.append(message)

    # F1 — Globally unique, persistent identifier
    flag(
        not metadata.identity.id,
        "[F1] identity.id is missing — data must have a unique identifier",
    )
    flag(
        not metadata.identity.persistent_identifier,
        "[F1] identity.persistent_identifier is missing — "
        "assign a DOI or other persistent identifier for long-term findability",
    )

    # F2 — Rich metadata: only flagged when more than two descriptive
    # fields are absent at once
    rich_fields = [
        (metadata.identity, "title", "identity"),
        (metadata.identity, "description", "identity"),
        (metadata.identity, "keywords", "identity"),
        (metadata.creator, "name", "creator"),
        (metadata.creator, "institution", "creator"),
        (metadata.temporal, "collected_start", "temporal"),
        (metadata.spatial, "geospatial_lat", "spatial"),
    ]
    missing_rich = len(
        [None for obj, field, _ in rich_fields if _check_field(obj, field, "") is not None]
    )
    flag(
        missing_rich > 2,
        f"[F2] {missing_rich} of {len(rich_fields)} recommended descriptive "
        "fields are missing — rich metadata improves findability",
    )

    # F3 — Metadata clearly includes identifier of the data
    # Satisfied by design: identity.id is part of the metadata blob

    # F4 — Registered in searchable resource; external registration cannot
    # be verified here, but STAC readiness can
    flag(
        metadata.spatial.bbox is None,
        "[F4] spatial.bbox is missing — needed for STAC catalog registration",
    )

    # A1 — Retrievable by identifier using standardised protocol
    flag(
        not metadata.references.access_url,
        "[A1] references.access_url is missing — "
        "provide a URL (S3, HTTP, or local) where data can be accessed",
    )

    # A2 — Metadata accessible even when data is no longer available
    # Architectural concern: metadata is coupled to data in root attrs

    # I1 — Formal, shared knowledge representation
    # Satisfied: JSON in Zarr root attrs with Pydantic schema

    # I2 — Use FAIR-compliant vocabularies
    flag(
        not metadata.identity.conventions,
        "[I2] identity.conventions is missing — "
        "declare metadata conventions (e.g. ACDD-1.3)",
    )

    # I3 — Qualified references to other metadata
    flag(
        not metadata.references.related_stores
        and not metadata.references.publications,
        "[I3] No related_stores or publications — "
        "cross-references improve interoperability",
    )

    # R1.1 — Clear, accessible data usage license
    flag(
        not metadata.publisher.license,
        "[R1.1] publisher.license is missing — "
        "specify an SPDX license (e.g. CC-BY-4.0) for reusability",
    )

    # R1.2 — Detailed provenance
    flag(
        not metadata.processing.software,
        "[R1.2] processing.software is empty — "
        "record software versions for provenance",
    )
    flag(
        metadata.environment.uv_lock_hash is None,
        "[R1.2] environment.uv_lock_hash is missing — "
        "store the lock file for environment reproducibility",
    )

    # R1.3 — Meet domain-relevant community standards
    # Checked by the other validators (DataCite, ACDD, STAC)

    return issues

validate_all(metadata)

Run all validators, return issues grouped by standard.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/validate.py
194
195
196
197
198
199
200
201
def validate_all(metadata: StoreMetadata) -> dict[str, list[str]]:
    """Run all validators, return issues grouped by standard."""
    # Standard name -> validator; dict order fixes the report order.
    validators = {
        "fair": validate_fair,
        "datacite": validate_datacite,
        "acdd": validate_acdd,
        "stac": validate_stac,
    }
    return {standard: check(metadata) for standard, check in validators.items()}

Inventory

Store catalog builder — scan directories for Icechunk stores.

scan_stores(root_dir, recursive=True)

Walk directories, find Icechunk stores, build a catalog.

Returns

pl.DataFrame — one row per store with metadata columns.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/inventory.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def scan_stores(root_dir: Path, recursive: bool = True) -> pl.DataFrame:
    """Walk directories, find Icechunk stores, build a catalog.

    Returns
    -------
    pl.DataFrame
        One row per store with metadata columns.
    """
    rows: list[dict[str, Any]] = []

    for store_path in _find_icechunk_stores(root_dir, recursive):
        # Start from a placeholder row; upgrade it if metadata is readable.
        row = _empty_row(store_path)
        if metadata_exists(store_path):
            try:
                row = _metadata_to_row(store_path, read_metadata(store_path))
            except Exception:
                # Best-effort: unreadable metadata degrades to the empty
                # row instead of aborting the whole scan.
                pass
        rows.append(row)

    # An explicit schema keeps the empty catalog's columns well-typed.
    return pl.DataFrame(rows) if rows else pl.DataFrame(schema=_SCHEMA)

scan_stores_as_stac(root_dir)

Build a STAC Catalog JSON dict from stores.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/inventory.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def scan_stores_as_stac(root_dir: Path) -> dict[str, Any]:
    """Build a STAC Catalog JSON dict from stores."""
    collections: list[dict[str, Any]] = []

    for store_path in _find_icechunk_stores(root_dir):
        if not metadata_exists(store_path):
            continue
        try:
            meta = read_metadata(store_path)
        except Exception:
            # Skip stores whose metadata cannot be read.
            continue

        # STAC extent: fall back to an empty bbox list / the collected
        # time range when the explicit spatial values are absent.
        spatial_bbox = [meta.spatial.bbox] if meta.spatial.bbox else [[]]
        temporal_interval = meta.spatial.extent_temporal_interval or [
            [meta.temporal.collected_start, meta.temporal.collected_end]
        ]

        collection: dict[str, Any] = {
            "type": "Collection",
            "stac_version": "1.1.0",
            "id": meta.identity.id,
            "title": meta.identity.title,
            "description": meta.identity.description or "",
            "license": meta.publisher.license or "proprietary",
            "extent": {
                "spatial": {"bbox": spatial_bbox},
                "temporal": {"interval": temporal_interval},
            },
            "links": [],
        }
        if meta.identity.keywords:
            collection["keywords"] = meta.identity.keywords
        collections.append(collection)

    child_links = [{"rel": "child", "href": f"#{c['id']}"} for c in collections]
    return {
        "type": "Catalog",
        "stac_version": "1.1.0",
        "id": "canvod-catalog",
        "description": "canVOD Icechunk Store Catalog",
        "links": child_links,
        "collections": collections,
    }

write_stac_collection(store_path, output_path=None, branch='main')

Write a STAC Collection JSON file for a single store.

Parameters

store_path (Path): path to the Icechunk store. · output_path (Path | None): output JSON file path; defaults to store_path / "collection.json". · branch (str): store branch to read metadata from.

Returns

Path The written JSON file path.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/inventory.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
def write_stac_collection(
    store_path: Path,
    output_path: Path | None = None,
    branch: str = "main",
) -> Path:
    """Write a STAC Collection JSON file for a single store.

    Parameters
    ----------
    store_path : Path
        Path to the Icechunk store.
    output_path : Path | None
        Output JSON file path. Defaults to ``store_path / "collection.json"``.
    branch : str
        Store branch to read metadata from.

    Returns
    -------
    Path
        The written JSON file path.
    """
    collection = _metadata_to_stac_collection(read_metadata(store_path, branch))

    target = store_path / "collection.json" if output_path is None else output_path
    target.parent.mkdir(parents=True, exist_ok=True)
    # default=str stringifies non-JSON values (e.g. datetimes) on the way out.
    target.write_text(json.dumps(collection, indent=2, default=str))
    return target

write_stac_catalog(root_dir, output_path=None, write_collections=True)

Write a STAC Catalog JSON and optional per-store Collection JSONs.

Parameters

root_dir (Path): root directory to scan for Icechunk stores. · output_path (Path | None): output catalog JSON path; defaults to root_dir / "catalog.json". · write_collections (bool): if True, also write a collection.json next to each store.

Returns

Path The written catalog JSON file path.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/inventory.py
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
def write_stac_catalog(
    root_dir: Path,
    output_path: Path | None = None,
    write_collections: bool = True,
) -> Path:
    """Write a STAC Catalog JSON and optional per-store Collection JSONs.

    Parameters
    ----------
    root_dir : Path
        Root directory to scan for Icechunk stores.
    output_path : Path | None
        Output catalog JSON path. Defaults to ``root_dir / "catalog.json"``.
    write_collections : bool
        If True, also write a ``collection.json`` next to each store.

    Returns
    -------
    Path
        The written catalog JSON file path.
    """
    catalog = scan_stores_as_stac(root_dir)

    if output_path is None:
        output_path = root_dir / "catalog.json"

    # Rewrite placeholder links to point at actual collection.json files.
    if write_collections:
        for sp in _find_icechunk_stores(root_dir):
            if not metadata_exists(sp):
                continue
            try:
                # Read the store metadata ONCE per store; the previous
                # version re-read it inside the per-link loop below.
                meta = read_metadata(sp)
                coll_path = write_stac_collection(sp)
                # Replace the "#<id>" placeholder with a relative path.
                rel = coll_path.relative_to(output_path.parent)
                placeholder = f"#{meta.identity.id}"
                for link in catalog["links"]:
                    if link.get("href") == placeholder:
                        link["href"] = str(rel)
            except Exception:
                # Best-effort: a store that fails to serialize keeps its
                # placeholder link rather than aborting the catalog write.
                continue

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(catalog, indent=2, default=str))
    return output_path

Display

Human-readable metadata display and query tool.

Usage

from canvod.store_metadata import show_metadata
show_metadata("/path/to/store")                  # full report
show_metadata("/path/to/store", section="env")   # just environment
show_metadata("/path/to/store", section="uv")    # dump uv.lock

format_metadata(meta, section=None)

Format metadata as a human-readable string.

Parameters

meta : StoreMetadata The metadata to format. section : str | None If given, show only this section. Special values: - "uv" / "uv.lock": dump raw uv.lock content - "toml" / "pyproject": dump raw pyproject.toml content - "env-reproduce": print instructions to reproduce env - Any key from _SECTIONS: show that section only If None, show full report.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/show.py
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
def format_metadata(
    meta: StoreMetadata,
    section: str | None = None,
) -> str:
    """Format metadata as a human-readable string.

    Parameters
    ----------
    meta : StoreMetadata
        The metadata to format.
    section : str | None
        If given, show only this section. Special values:
        - "uv" / "uv.lock": dump raw uv.lock content
        - "toml" / "pyproject": dump raw pyproject.toml content
        - "env-reproduce": print instructions to reproduce env
        - Any key from _SECTIONS: show that section only
        If None, show full report.
    """
    # Raw-dump sections: embedded file contents, or a placeholder note.
    if section in ("uv", "uv.lock"):
        text = meta.environment.uv_lock_text
        return text if text else "(uv.lock not stored in this metadata)"

    if section in ("toml", "pyproject", "pyproject.toml"):
        text = meta.environment.pyproject_toml_text
        return text if text else "(pyproject.toml not stored in this metadata)"

    if section in ("env-reproduce", "reproduce"):
        return _format_reproduce_instructions(meta)

    # Named section from the formatter registry.
    if section is not None:
        formatter = _SECTIONS.get(section)
        if formatter is None:
            available = ", ".join([*_SECTIONS, "uv", "toml", "reproduce"])
            return f"Unknown section '{section}'. Available: {available}"
        return formatter(meta)

    # Full report: banner header followed by every section in order.
    divider = "=" * 72
    parts = [
        divider,
        "  canvod Store Metadata Report",
        f"  Schema version: {meta.metadata_version}",
        divider,
    ]
    for formatter in _SECTIONS.values():
        parts.extend(["", formatter(meta)])
    parts.extend(["", _hr("=")])
    return "\n".join(parts)

extract_env(store_path, output_dir, branch='main')

Extract pyproject.toml + uv.lock from a store for reproduction.

Parameters

store_path : Path Path to the Icechunk store. output_dir : Path Directory to write the files into. branch : str Store branch.

Returns

Path The output directory (ready for uv sync --frozen).

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/show.py
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
def extract_env(
    store_path: Path,
    output_dir: Path,
    branch: str = "main",
) -> Path:
    """Extract pyproject.toml + uv.lock from a store for reproduction.

    Parameters
    ----------
    store_path : Path
        Path to the Icechunk store.
    output_dir : Path
        Directory to write the files into.
    branch : str
        Store branch.

    Returns
    -------
    Path
        The output directory (ready for ``uv sync --frozen``).

    Raises
    ------
    ValueError
        If the metadata does not embed pyproject.toml or uv.lock text.
    """
    meta = read_metadata(store_path, branch)

    # Validate BEFORE touching the filesystem, so a failed extraction
    # does not leave behind an empty output directory.
    if meta.environment.pyproject_toml_text is None:
        msg = "pyproject.toml not stored in this metadata"
        raise ValueError(msg)
    if meta.environment.uv_lock_text is None:
        msg = "uv.lock not stored in this metadata"
        raise ValueError(msg)

    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "pyproject.toml").write_text(meta.environment.pyproject_toml_text)
    (output_dir / "uv.lock").write_text(meta.environment.uv_lock_text)
    return output_dir

show_metadata(store_path, section=None, branch='main')

Print store metadata to stdout.

Parameters

store_path : str | Path Path to an Icechunk store. section : str | None Section to show (None = full report). branch : str Store branch.

Source code in packages/canvod-store-metadata/src/canvod/store_metadata/show.py
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
def show_metadata(
    store_path: str | Path,
    section: str | None = None,
    branch: str = "main",
) -> None:
    """Print store metadata to stdout.

    Parameters
    ----------
    store_path : str | Path
        Path to an Icechunk store.
    section : str | None
        Section to show (None = full report).
    branch : str
        Store branch.
    """
    path = Path(store_path)
    if metadata_exists(path, branch):
        meta = read_metadata(path, branch)
        print(format_metadata(meta, section))
    else:
        print(f"No metadata found in {path}")