Source code for IceAdvect.io.geotiff

#!/usr/bin/env python
"""
geotiff.py
Written by Tyler Sutterley (04/2026)

Reads geotiff files as xarray Datasets

PYTHON DEPENDENCIES:
    pyproj: Python interface to PROJ library
        https://pypi.org/project/pyproj/
        https://pyproj4.github.io/pyproj/
    rioxarray: N-D labeled arrays and datasets in Python
        https://docs.xarray.dev/en/stable/

UPDATE HISTORY:
    Updated 04/2026: added lineage attribute to save filename(s)
    Updated 02/2026: added logging information when opening files
    Written 01/2026
"""

from __future__ import division, annotations

import os
import re
import pyproj
import logging
import pathlib
import warnings
import numpy as np
import xarray as xr
import timescale.time
import IceAdvect.utilities
from IceAdvect.io.dataset import combine_attrs

# attempt imports
dask = IceAdvect.utilities.import_dependency("dask")
dask_available = IceAdvect.utilities.dependency_available("dask")
rioxarray = IceAdvect.utilities.import_dependency("rioxarray")
rioxarray.merge = IceAdvect.utilities.import_dependency("rioxarray.merge")

# set environmental variable for anonymous s3 access
os.environ["AWS_NO_SIGN_REQUEST"] = "YES"
# suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)


[docs] def open_mfdataset( filenames: list[str] | list[pathlib.Path], mapping: dict, **kwargs, ) -> xr.Dataset: """Open a geotiff file as an xarray Dataset Parameters ---------- filenames: list of str or pathlib.Path Path to geotiff file mapping: dict Dictionary mapping standard variable names to patterns for the file chunks: int, dict, str, or None, default None variable chunk sizes for dask (see ``rioxarray.open_rasterio``) Returns ------- ds: xr.Dataset xarray Dataset """ # verify that filename is iterable if isinstance(filenames, (str, pathlib.Path)): filenames = [filenames] # read the geotiff files as an xarray Datasets datasets = [] for f in filenames: # determine variable name from mapping variable, pattern = parse_file(f, mapping) # append dataset to list datasets.append( open_dataset(f, variable=variable, pattern=pattern, **kwargs) ) # merge Datasets darr = xr.merge( datasets, combine_attrs=combine_attrs, compat="override", join="override", ) # return xarray Dataset return darr
[docs] def open_dataset( filename: list, variable: str = "variable", **kwargs, ) -> xr.Dataset: """Open a geotiff file as an xarray Dataset Parameters ---------- filename: str Path to geotiff file variable: str, default "variable" variable name for the file mapping: dict or None, default None Dictionary mapping standard variable names to those in the file chunks: int, dict, str, or None, default None variable chunk sizes for dask (see ``rioxarray.open_rasterio``) Returns ------- ds: xr.Dataset xarray Dataset """ mapping = kwargs.get("mapping", None) if mapping is not None: variable, pattern = parse_file(filename, mapping) kwargs["pattern"] = pattern # read the geotiff file as an xarray DataArray darr = open_dataarray(filename, **kwargs) # convert DataArray to Dataset ds = darr.to_dataset(name=variable) # add lineage attribute ds.attrs["lineage"] = pathlib.Path(filename).name # return the xarray Dataset return ds
# PURPOSE: read a list of model files
[docs] def open_mfdataarray( filenames: list[str] | list[pathlib.Path], parallel: bool = False, **kwargs, ): """ Open multiple geotiff files Parameters ---------- filenames: list of str or pathlib.Path list of files parallel: bool, default False Open files in parallel using ``dask.delayed`` kwargs: dict additional keyword arguments for opening files Returns ------- darr: xarray.DataArray xarray DataArray """ # verify that filename is iterable if isinstance(filenames, (str, pathlib.Path)): filenames = [filenames] # read each file as xarray DataArray and append to list if parallel and dask_available: opener = dask.delayed(open_dataarray) else: opener = open_dataarray # read each file as xarray dataset and append to list dataarrays = [opener(f, **kwargs) for f in filenames] # read datasets as dask arrays if parallel and dask_available: (dataarrays,) = dask.compute(dataarrays) # merge DataArray darr = xr.merge(dataarrays, compat="override", join="override") # return xarray DataArray return darr
[docs] def open_dataarray( filename: str, longterm: bool = False, pattern: str | None = None, chunks: int | dict | str | None = None, **kwargs, ) -> xr.DataArray: """Open a geotiff file as an xarray DataArray Parameters ---------- filename: str Path to geotiff file longterm: bool, default False Datafile is a long-term average pattern: str or None, default None Regular expression pattern for extracting time information chunks: int, dict, str, or None, default None variable chunk sizes for dask (see ``rioxarray.open_rasterio``) Returns ------- darr: xr.DataArray xarray DataArray """ # get coordinate reference system (CRS) information from kwargs crs = kwargs.get("crs", None) # verbose logging logging.debug(f"Opening GeoTIFF file: {filename}") # open the geotiff file using rioxarray darr = rioxarray.open_rasterio( filename, masked=True, chunks=chunks, **kwargs ) # name of the input file name = IceAdvect.utilities.Path(filename).name # assign time dimension for long-term averages or from filename pattern if longterm: pass elif pattern and re.search(pattern, name, re.I): # extract start and end time from filename _, start, end, _ = re.findall(pattern, name, re.I).pop() # parse strings into datetime objects start_time = timescale.time.parse(start) end_time = timescale.time.parse(end) time_array = np.array([start_time, end_time], dtype="datetime64[D]") # convert to timescale objects and take the mean ts = timescale.from_datetime(time_array) darr["time"] = xr.DataArray(ts.mean().to_datetime(), dims="band") darr = darr.swap_dims({"band": "time"}) # attach coordinate reference system (CRS) information if crs is not None: darr.attrs["crs"] = pyproj.CRS.from_user_input(crs).to_dict() else: crs_wkt = darr.spatial_ref.attrs["crs_wkt"] darr.attrs["crs"] = pyproj.CRS.from_user_input(crs_wkt).to_dict() # return xarray DataArray return darr
[docs] def parse_file(filename: str | pathlib.Path, mapping: dict): """ Determine variable name from filename using mapping Parameters ---------- filename: str or pathlib.Path Path to file mapping: dict Dictionary mapping standard variable names to patterns for the file Returns ------- variable: str Variable name for the file pattern: str or None Regular expression pattern for extracting time information """ # default variable name and pattern variable = "variable" pattern = None # determine pattern for extracting time information for k, v in mapping.items(): if re.search(v, str(filename), re.IGNORECASE): variable, pattern = k, v break # return variable name and pattern return variable, pattern