#!/usr/bin/env python
u"""
spatial.py
Written by Tyler Sutterley (12/2024)
Utilities for reading, writing and operating on spatial data
PYTHON DEPENDENCIES:
numpy: Scientific Computing Tools For Python
https://numpy.org
https://numpy.org/doc/stable/user/numpy-for-matlab-users.html
UPDATE HISTORY:
Updated 12/2024: add latitude and longitude as potential dimension names
Updated 11/2024: added function to calculate the altitude and azimuth
Updated 09/2024: deprecation fix case where an array is output to scalars
Updated 08/2024: changed from 'geotiff' to 'GTiff' and 'cog' formats
added functions to convert to and from East-North-Up coordinates
Updated 07/2024: added functions to convert to and from DMS
Updated 06/2024: added function to write parquet files with metadata
Updated 05/2024: added function to read from parquet files
allowing for decoding of the geometry column from WKB
deprecation update to use exceptions with osgeo osr
Updated 04/2024: use timescale for temporal operations
use wrapper to importlib for optional dependencies
Updated 03/2024: can calculate polar stereographic distortion for distances
Updated 02/2024: changed class name for ellipsoid parameters to datum
Updated 10/2023: can read from netCDF4 or HDF5 variable groups
apply no formatting to columns in ascii file output
Updated 09/2023: add function to invert field mapping keys and values
use datetime64[ns] for parsing dates from ascii files
Updated 08/2023: remove possible crs variables from output fields list
place PyYAML behind try/except statement to reduce build size
Updated 05/2023: use datetime parser within pyTMD.time module
Updated 04/2023: copy inputs in cartesian to not modify original arrays
added iterative methods for converting from cartesian to geodetic
allow netCDF4 and HDF5 outputs to be appended to existing files
using pathlib to define and expand paths
Updated 03/2023: add basic variable typing to function inputs
Updated 02/2023: use outputs from constants class for WGS84 parameters
include more possible dimension names for gridded and drift outputs
Updated 01/2023: added default field mapping for reading from netCDF4/HDF5
split netCDF4 output into separate functions for grid and drift types
Updated 12/2022: add software information to output HDF5 and netCDF4
Updated 11/2022: place some imports within try/except statements
added encoding for writing ascii files
use f-strings for formatting verbose or ascii output
Updated 10/2022: added datetime parser for ascii time columns
Updated 06/2022: added field_mapping options to netCDF4 and HDF5 reads
added from_file wrapper function to read from particular formats
Updated 04/2022: add option to reduce input GDAL raster datasets
updated docstrings to numpy documentation format
use gzip virtual filesystem for reading compressed geotiffs
include utf-8 encoding in reads to be windows compliant
Updated 03/2022: add option to specify output GDAL driver
Updated 01/2022: use iteration breaks in convert ellipsoid function
remove fill_value attribute after creating netCDF4 and HDF5 variables
Updated 11/2021: added empty cases to netCDF4 and HDF5 output for crs
try to get grid mapping attributes from netCDF4 and HDF5
Updated 10/2021: add pole case in stereographic area scale calculation
using python logging for handling verbose output
Updated 09/2021: can calculate height differences between ellipsoids
Updated 07/2021: added function for determining input variable type
Updated 03/2021: added polar stereographic area scale calculation
add routines for converting to and from cartesian coordinates
replaced numpy bool/int to prevent deprecation warnings
Updated 01/2021: add streaming from bytes for ascii, netCDF4, HDF5, geotiff
set default time for geotiff files to 0
Updated 12/2020: added module for converting ellipsoids
Updated 11/2020: output data as masked arrays if containing fill values
add functions to read from and write to geotiff image formats
Written 09/2020
"""
from __future__ import annotations
import re
import io
import copy
import gzip
import json
import uuid
import logging
import pathlib
import warnings
import datetime
import collections
import numpy as np
import timescale.time
import grounding_zones.version
from grounding_zones.utilities import import_dependency
# attempt imports
osgeo = import_dependency('osgeo')
osgeo.gdal = import_dependency('osgeo.gdal')
osgeo.osr = import_dependency('osgeo.osr')
osgeo.gdalconst = import_dependency('osgeo.gdalconst')
h5py = import_dependency('h5py')
netCDF4 = import_dependency('netCDF4')
pd = import_dependency('pandas')
pyarrow = import_dependency('pyarrow')
pyarrow.parquet = import_dependency('pyarrow.parquet')
shapely = import_dependency('shapely')
shapely.geometry = import_dependency('shapely.geometry')
yaml = import_dependency('yaml')
__all__ = [
"case_insensitive_filename",
"from_file",
"from_ascii",
"from_netCDF4",
"from_HDF5",
"from_geotiff",
"from_parquet",
"to_file",
"to_ascii",
"to_netCDF4",
"_drift_netCDF4",
"_grid_netCDF4",
"_time_series_netCDF4",
"to_HDF5",
"to_geotiff",
"to_parquet",
"expand_dims",
"default_field_mapping",
"inverse_mapping",
]
[docs]
def case_insensitive_filename(filename: str | pathlib.Path):
"""
Searches a directory for a filename without case dependence
Parameters
----------
filename: str
input filename
"""
# check if file presently exists with input case
filename = pathlib.Path(filename).expanduser().absolute()
if not filename.exists():
# search for filename without case dependence
f = [f.name for f in filename.parent.iterdir() if
re.match(filename.name, f.name, re.I)]
# raise error if no file found
if not f:
raise FileNotFoundError(str(filename))
filename = filename.with_name(f.pop())
# return the matched filename
return filename
[docs]
def from_file(filename: str, format: str, **kwargs):
"""
Wrapper function for reading data from an input format
Parameters
----------
filename: str
full path of input file
format: str
format of input file
**kwargs: dict
Keyword arguments for file reader
"""
# read input file to extract spatial coordinates and data
if (format == 'ascii'):
dinput = from_ascii(filename, **kwargs)
elif (format == 'netCDF4'):
dinput = from_netCDF4(filename, **kwargs)
elif (format == 'HDF5'):
dinput = from_HDF5(filename, **kwargs)
elif format in ('GTiff','cog'):
dinput = from_geotiff(filename, **kwargs)
elif (format == 'parquet'):
dinput = from_parquet(filename, **kwargs)
else:
raise ValueError(f'Invalid format {format}')
return dinput
[docs]
def from_ascii(filename: str, **kwargs):
"""
Read data from an ascii file
Parameters
----------
filename: str
full path of input ascii file
compression: str or NoneType, default None
file compression type
columns: list, default ['time', 'y', 'x', 'data']
column names of ascii file
delimiter: str, default ','
Delimiter for csv or ascii files
header: int, default 0
header lines to skip from start of file
parse_dates: bool, default False
Try parsing the time column
"""
# set default keyword arguments
kwargs.setdefault('compression', None)
kwargs.setdefault('columns', ['time', 'y', 'x', 'data'])
kwargs.setdefault('delimiter', ',')
kwargs.setdefault('header', 0)
kwargs.setdefault('parse_dates', False)
# print filename
logging.info(str(filename))
# get column names
columns = copy.copy(kwargs['columns'])
# open the ascii file and extract contents
if (kwargs['compression'] == 'gzip'):
# read input ascii data from gzip compressed file and split lines
filename = case_insensitive_filename(filename)
with gzip.open(filename, 'r') as f:
file_contents = f.read().decode('ISO-8859-1').splitlines()
elif (kwargs['compression'] == 'bytes'):
# read input file object and split lines
file_contents = filename.read().splitlines()
else:
# read input ascii file (.txt, .asc) and split lines
filename = case_insensitive_filename(filename)
with open(filename, mode='r', encoding='utf8') as f:
file_contents = f.read().splitlines()
# number of lines in the file
file_lines = len(file_contents)
# compile regular expression operator for extracting numerical values
# from input ascii files of spatial data
regex_pattern = r'[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[EeD][+-]?\d+)?'
rx = re.compile(regex_pattern, re.VERBOSE)
# check if header has a known format
if (str(kwargs['header']).upper() == 'YAML'):
# counts the number of lines in the header
YAML = False
count = 0
# Reading over header text
while (YAML is False) & (count < file_lines):
# file line at count
line = file_contents[count]
# if End of YAML Header is found: set YAML flag
YAML = bool(re.search(r"\# End of YAML header", line))
# add 1 to counter
count += 1
# parse the YAML header (specifying yaml loader)
YAML_HEADER = yaml.load('\n'.join(file_contents[:count]),
Loader=yaml.BaseLoader)
# output spatial data and attributes
dinput = {}
# copy global attributes
dinput['attributes'] = YAML_HEADER['header']['global_attributes']
# allocate for each variable and copy variable attributes
for c in columns:
if (c == 'time') and kwargs['parse_dates']:
dinput[c] = np.zeros((file_lines-count), dtype='datetime64[ns]')
else:
dinput[c] = np.zeros((file_lines-count))
dinput['attributes'][c] = YAML_HEADER['header']['variables'][c]
# update number of file lines to skip for reading data
header = int(count)
else:
# allocate for each variable and variable attributes
dinput = {}
header = int(kwargs['header'])
for c in columns:
if (c == 'time') and kwargs['parse_dates']:
dinput[c] = np.zeros((file_lines-header), dtype='datetime64[ns]')
else:
dinput[c] = np.zeros((file_lines-header))
dinput['attributes'] = {c:dict() for c in columns}
# extract spatial data array
# for each line in the file
for i, line in enumerate(file_contents[header:]):
# extract columns of interest and assign to dict
# convert fortran exponentials if applicable
if kwargs['delimiter']:
column = {c:l.replace('D', 'E') for c, l in
zip(columns, line.split(kwargs['delimiter']))}
else:
column = {c:r.replace('D', 'E') for c, r in
zip(columns, rx.findall(line))}
# copy variables from column dict to output dictionary
for c in columns:
if (c == 'time') and kwargs['parse_dates']:
dinput[c][i] = timescale.time.parse(column[c])
else:
dinput[c][i] = np.float64(column[c])
# convert to masked array if fill values
if 'data' in dinput.keys() and '_FillValue' in dinput['attributes']['data'].keys():
dinput['data'] = np.ma.asarray(dinput['data'])
dinput['data'].fill_value = dinput['attributes']['data']['_FillValue']
dinput['data'].mask = (dinput['data'].data == dinput['data'].fill_value)
# return the spatial variables
return dinput
[docs]
def from_netCDF4(filename: str, **kwargs):
"""
Read data from a netCDF4 file
Parameters
----------
filename: str
full path of input netCDF4 file
compression: str or NoneType, default None
file compression type
group: str or NoneType, default None
netCDF4 variable group
timename: str, default 'time'
name for time-dimension variable
xname: str, default 'lon'
name for x-dimension variable
yname: str, default 'lat'
name for y-dimension variable
varname: str, default 'data'
name for data variable
field_mapping: dict, default {}
mapping between output variables and input netCDF4
"""
# set default keyword arguments
kwargs.setdefault('compression', None)
kwargs.setdefault('group', None)
kwargs.setdefault('timename', 'time')
kwargs.setdefault('xname', 'lon')
kwargs.setdefault('yname', 'lat')
kwargs.setdefault('varname', 'data')
kwargs.setdefault('field_mapping', {})
# read data from netCDF4 file
# Open the NetCDF4 file for reading
if (kwargs['compression'] == 'gzip'):
# read as in-memory (diskless) netCDF4 dataset
with gzip.open(case_insensitive_filename(filename), 'r') as f:
fileID = netCDF4.Dataset(uuid.uuid4().hex, memory=f.read())
elif (kwargs['compression'] == 'bytes'):
# read as in-memory (diskless) netCDF4 dataset
fileID = netCDF4.Dataset(uuid.uuid4().hex, memory=filename.read())
else:
# read netCDF4 dataset
fileID = netCDF4.Dataset(case_insensitive_filename(filename), 'r')
# Output NetCDF file information
logging.info(fileID.filepath())
logging.info(list(fileID.variables.keys()))
# create python dictionary for output variables and attributes
dinput = {}
dinput['attributes'] = {}
# get attributes for the file
for attr in ['title', 'description', 'projection']:
# try getting the attribute
try:
ncattr, = [s for s in fileID.ncattrs() if re.match(attr, s, re.I)]
dinput['attributes'][attr] = fileID.getncattr(ncattr)
except (ValueError, AttributeError):
pass
# list of attributes to attempt to retrieve from included variables
attributes_list = ['description', 'units', 'long_name', 'calendar',
'standard_name', 'grid_mapping', '_FillValue']
# mapping between netCDF4 variable names and output names
if not kwargs['field_mapping']:
kwargs['field_mapping']['x'] = copy.copy(kwargs['xname'])
kwargs['field_mapping']['y'] = copy.copy(kwargs['yname'])
if kwargs['varname'] is not None:
kwargs['field_mapping']['data'] = copy.copy(kwargs['varname'])
if kwargs['timename'] is not None:
kwargs['field_mapping']['time'] = copy.copy(kwargs['timename'])
# check if reading from root group or sub-group
group = fileID.groups[kwargs['group']] if kwargs['group'] else fileID
# for each variable
for key, nc in kwargs['field_mapping'].items():
# Getting the data from each NetCDF variable
dinput[key] = group.variables[nc][:]
# get attributes for the included variables
dinput['attributes'][key] = {}
for attr in attributes_list:
# try getting the attribute
try:
ncattr, = [s for s in group.variables[nc].ncattrs()
if re.match(attr, s, re.I)]
dinput['attributes'][key][attr] = \
group.variables[nc].getncattr(ncattr)
except (ValueError, AttributeError):
pass
# get projection information if there is a grid_mapping attribute
if 'data' in dinput.keys() and 'grid_mapping' in dinput['attributes']['data'].keys():
# try getting the attribute
grid_mapping = dinput['attributes']['data']['grid_mapping']
# get coordinate reference system attributes
dinput['attributes']['crs'] = {}
for att_name in group[grid_mapping].ncattrs():
dinput['attributes']['crs'][att_name] = \
group.variables[grid_mapping].getncattr(att_name)
# get the spatial projection reference information from wkt
# and overwrite the file-level projection attribute (if existing)
osgeo.osr.UseExceptions()
srs = osgeo.osr.SpatialReference()
srs.ImportFromWkt(dinput['attributes']['crs']['crs_wkt'])
dinput['attributes']['projection'] = srs.ExportToProj4()
# convert to masked array if fill values
if 'data' in dinput.keys() and '_FillValue' in dinput['attributes']['data'].keys():
dinput['data'] = np.ma.asarray(dinput['data'])
dinput['data'].fill_value = dinput['attributes']['data']['_FillValue']
dinput['data'].mask = (dinput['data'].data == dinput['data'].fill_value)
# Closing the NetCDF file
fileID.close()
# return the spatial variables
return dinput
[docs]
def from_HDF5(filename: str | pathlib.Path, **kwargs):
"""
Read data from a HDF5 file
Parameters
----------
filename: str
full path of input HDF5 file
compression: str or NoneType, default None
file compression type
group: str or NoneType, default None
netCDF4 variable group
timename: str, default 'time'
name for time-dimension variable
xname: str, default 'lon'
name for x-dimension variable
yname: str, default 'lat'
name for y-dimension variable
varname: str, default 'data'
name for data variable
field_mapping: dict, default {}
mapping between output variables and input HDF5
"""
# set default keyword arguments
kwargs.setdefault('compression', None)
kwargs.setdefault('group', None)
kwargs.setdefault('timename', 'time')
kwargs.setdefault('xname', 'lon')
kwargs.setdefault('yname', 'lat')
kwargs.setdefault('varname', 'data')
kwargs.setdefault('field_mapping', {})
# read data from HDF5 file
# Open the HDF5 file for reading
if (kwargs['compression'] == 'gzip'):
# read gzip compressed file and extract into in-memory file object
with gzip.open(case_insensitive_filename(filename), 'r') as f:
fid = io.BytesIO(f.read())
# set filename of BytesIO object
fid.filename = filename.name
# rewind to start of file
fid.seek(0)
# read as in-memory (diskless) HDF5 dataset from BytesIO object
fileID = h5py.File(fid, 'r')
elif (kwargs['compression'] == 'bytes'):
# read as in-memory (diskless) HDF5 dataset
fileID = h5py.File(filename, 'r')
else:
# read HDF5 dataset
fileID = h5py.File(case_insensitive_filename(filename), 'r')
# Output HDF5 file information
logging.info(fileID.filename)
logging.info(list(fileID.keys()))
# create python dictionary for output variables and attributes
dinput = {}
dinput['attributes'] = {}
# get attributes for the file
for attr in ['title', 'description', 'projection']:
# try getting the attribute
try:
dinput['attributes'][attr] = fileID.attrs[attr]
except (KeyError, AttributeError):
pass
# list of attributes to attempt to retrieve from included variables
attributes_list = ['description', 'units', 'long_name', 'calendar',
'standard_name', 'grid_mapping', '_FillValue']
# mapping between HDF5 variable names and output names
if not kwargs['field_mapping']:
kwargs['field_mapping']['x'] = copy.copy(kwargs['xname'])
kwargs['field_mapping']['y'] = copy.copy(kwargs['yname'])
if kwargs['varname'] is not None:
kwargs['field_mapping']['data'] = copy.copy(kwargs['varname'])
if kwargs['timename'] is not None:
kwargs['field_mapping']['time'] = copy.copy(kwargs['timename'])
# check if reading from root group or sub-group
group = fileID[kwargs['group']] if kwargs['group'] else fileID
# for each variable
for key, h5 in kwargs['field_mapping'].items():
# Getting the data from each HDF5 variable
dinput[key] = np.copy(group[h5][:])
# get attributes for the included variables
dinput['attributes'][key] = {}
for attr in attributes_list:
# try getting the attribute
try:
dinput['attributes'][key][attr] = group[h5].attrs[attr]
except (KeyError, AttributeError):
pass
# get projection information if there is a grid_mapping attribute
if 'data' in dinput.keys() and 'grid_mapping' in dinput['attributes']['data'].keys():
# try getting the attribute
grid_mapping = dinput['attributes']['data']['grid_mapping']
# get coordinate reference system attributes
dinput['attributes']['crs'] = {}
for att_name, att_val in group[grid_mapping].attrs.items():
dinput['attributes']['crs'][att_name] = att_val
# get the spatial projection reference information from wkt
# and overwrite the file-level projection attribute (if existing)
osgeo.osr.UseExceptions()
srs = osgeo.osr.SpatialReference()
srs.ImportFromWkt(dinput['attributes']['crs']['crs_wkt'])
dinput['attributes']['projection'] = srs.ExportToProj4()
# convert to masked array if fill values
if 'data' in dinput.keys() and '_FillValue' in dinput['attributes']['data'].keys():
dinput['data'] = np.ma.asarray(dinput['data'])
dinput['data'].fill_value = dinput['attributes']['data']['_FillValue']
dinput['data'].mask = (dinput['data'].data == dinput['data'].fill_value)
# Closing the HDF5 file
fileID.close()
# return the spatial variables
return dinput
[docs]
def from_geotiff(filename: str, **kwargs):
"""
Read data from a geotiff file
Parameters
----------
filename: str
full path of input geotiff file
compression: str or NoneType, default None
file compression type
bounds: list or NoneType, default bounds
extent of the file to read: ``[xmin, xmax, ymin, ymax]``
"""
# set default keyword arguments
kwargs.setdefault('compression', None)
kwargs.setdefault('bounds', None)
# Open the geotiff file for reading
if (kwargs['compression'] == 'gzip'):
# read as GDAL gzip virtual geotiff dataset
mmap_name = f"/vsigzip/{str(case_insensitive_filename(filename))}"
ds = osgeo.gdal.Open(mmap_name)
elif (kwargs['compression'] == 'bytes'):
# read as GDAL memory-mapped (diskless) geotiff dataset
mmap_name = f"/vsimem/{uuid.uuid4().hex}"
osgeo.gdal.FileFromMemBuffer(mmap_name, filename.read())
ds = osgeo.gdal.Open(mmap_name)
else:
# read geotiff dataset
ds = osgeo.gdal.Open(str(case_insensitive_filename(filename)),
osgeo.gdalconst.GA_ReadOnly)
# print geotiff file if verbose
logging.info(str(filename))
# create python dictionary for output variables and attributes
dinput = {}
dinput['attributes'] = {c:dict() for c in ['x', 'y', 'data']}
# get the spatial projection reference information
srs = ds.GetSpatialRef()
dinput['attributes']['projection'] = srs.ExportToProj4()
dinput['attributes']['wkt'] = srs.ExportToWkt()
# get dimensions
xsize = ds.RasterXSize
ysize = ds.RasterYSize
bsize = ds.RasterCount
# get geotiff info
info_geotiff = ds.GetGeoTransform()
dinput['attributes']['spacing'] = (info_geotiff[1], info_geotiff[5])
# calculate image extents
xmin = info_geotiff[0]
ymax = info_geotiff[3]
xmax = xmin + (xsize-1)*info_geotiff[1]
ymin = ymax + (ysize-1)*info_geotiff[5]
# x and y pixel center coordinates (converted from upper left)
x = xmin + info_geotiff[1]/2.0 + np.arange(xsize)*info_geotiff[1]
y = ymax + info_geotiff[5]/2.0 + np.arange(ysize)*info_geotiff[5]
# if reducing to specified bounds
if kwargs['bounds'] is not None:
# reduced x and y limits
xlimits = (kwargs['bounds'][0], kwargs['bounds'][1])
ylimits = (kwargs['bounds'][2], kwargs['bounds'][3])
# Specify offset and rows and columns to read
xoffset = int((xlimits[0] - xmin)/info_geotiff[1])
yoffset = int((ymax - ylimits[1])/np.abs(info_geotiff[5]))
xcount = int((xlimits[1] - xlimits[0])/info_geotiff[1]) + 1
ycount = int((ylimits[1] - ylimits[0])/np.abs(info_geotiff[5])) + 1
# reduced x and y pixel center coordinates
dinput['x'] = x[slice(xoffset, xoffset + xcount, None)]
dinput['y'] = y[slice(yoffset, yoffset + ycount, None)]
# read reduced image with GDAL
dinput['data'] = ds.ReadAsArray(xoff=xoffset, yoff=yoffset,
xsize=xcount, ysize=ycount)
# reduced image extent (converted back to upper left)
xmin = np.min(dinput['x']) - info_geotiff[1]/2.0
xmax = np.max(dinput['x']) - info_geotiff[1]/2.0
ymin = np.min(dinput['y']) - info_geotiff[5]/2.0
ymax = np.max(dinput['y']) - info_geotiff[5]/2.0
else:
# x and y pixel center coordinates
dinput['x'] = np.copy(x)
dinput['y'] = np.copy(y)
# read full image with GDAL
dinput['data'] = ds.ReadAsArray()
# image extent
dinput['attributes']['extent'] = (xmin, xmax, ymin, ymax)
# set default time to zero for each band
dinput.setdefault('time', np.zeros((bsize)))
# check if image has fill values
dinput['data'] = np.ma.asarray(dinput['data'])
dinput['data'].mask = np.zeros_like(dinput['data'], dtype=bool)
if ds.GetRasterBand(1).GetNoDataValue():
# mask invalid values
dinput['data'].fill_value = ds.GetRasterBand(1).GetNoDataValue()
# create mask array for bad values
dinput['data'].mask[:] = (dinput['data'].data == dinput['data'].fill_value)
# set attribute for fill value
dinput['attributes']['data']['_FillValue'] = dinput['data'].fill_value
# close the dataset
ds = None
# return the spatial variables
return dinput
[docs]
def from_parquet(filename: str, **kwargs):
"""
Read data from a parquet file
Parameters
----------
filename: str
full path of input parquet file
index: str, default 'time'
name of index column
columns: list or None, default None
column names of parquet file
primary_column: str, default 'geometry'
default geometry column in geoparquet files
geometry_encoding: str, default 'WKB'
default encoding for geoparquet files
"""
# set default keyword arguments
kwargs.setdefault('index', 'time')
kwargs.setdefault('columns', None)
kwargs.setdefault('primary_column', 'geometry')
kwargs.setdefault('geometry_encoding', 'WKB')
filename = case_insensitive_filename(filename)
# read input parquet file
dinput = pd.read_parquet(filename)
# reset the dataframe index if not a range index
if not isinstance(dinput.index, pd.RangeIndex):
dinput.reset_index(inplace=True, names=kwargs['index'])
# output parquet file information
attr = {}
# get parquet file metadata
metadata = pyarrow.parquet.read_metadata(filename).metadata
# decode parquet metadata from JSON
for att_name, val in metadata.items():
try:
att_val = json.loads(val.decode('utf-8'))
attr[att_name.decode('utf-8')] = att_val
except Exception as exc:
pass
# check if parquet file contains geometry metadata
attr['geoparquet'] = False
primary_column = kwargs['primary_column']
encoding = kwargs['geometry_encoding']
if 'geo' in attr.keys():
# extract crs and encoding from geoparquet metadata
primary_column = attr['geo']['primary_column']
crs_metadata = attr['geo']['columns'][primary_column]['crs']
encoding = attr['geo']['columns'][primary_column]['encoding']
attr['geometry_encoding'] = encoding
# create spatial reference object from PROJJSON
osgeo.osr.UseExceptions()
srs = osgeo.osr.SpatialReference()
srs.SetFromUserInput(json.dumps(crs_metadata))
# add projection information to attributes
attr['projection'] = srs.ExportToProj4()
attr['wkt'] = srs.ExportToWkt()
elif 'pyTMD' in attr.keys():
# extract crs from pyTMD metadata
crs_metadata = attr['pyTMD']['crs']
# create spatial reference object from PROJJSON
osgeo.osr.UseExceptions()
srs = osgeo.osr.SpatialReference()
srs.SetFromUserInput(json.dumps(crs_metadata))
# add projection information to attributes
attr['projection'] = srs.ExportToProj4()
attr['wkt'] = srs.ExportToWkt()
# extract x and y coordinates
if (encoding == 'WKB') and (primary_column in dinput.keys()):
# set as geoparquet file
attr['geoparquet'] = True
# decode geometry column from WKB
geometry = shapely.from_wkb(dinput[primary_column].values)
dinput['x'] = shapely.get_x(geometry)
dinput['y'] = shapely.get_y(geometry)
# remap columns to default names
if kwargs['columns'] is not None:
field_mapping = default_field_mapping(kwargs['columns'])
remap = inverse_mapping(field_mapping)
dinput.rename(columns=remap, inplace=True)
# return the data and attributes
dinput.attrs = copy.copy(attr)
return dinput
[docs]
def to_file(
output: dict,
attributes: dict,
filename: str | pathlib.Path,
format: str,
**kwargs
):
"""
Wrapper function for writing data to an output format
Parameters
----------
output: dict
python dictionary of output data
attributes: dict
python dictionary of output attributes
filename: str or pathlib.Path,
full path of output file
format: str
format of output file
**kwargs: dict
Keyword arguments for file writer
"""
# read input file to extract spatial coordinates and data
if (format == 'ascii'):
to_ascii(output, attributes, filename, **kwargs)
elif (format == 'netCDF4'):
to_netCDF4(output, attributes, filename, **kwargs)
elif (format == 'HDF5'):
to_HDF5(output, attributes, filename, **kwargs)
elif format in ('GTiff','cog'):
to_geotiff(output, attributes, filename, **kwargs)
elif (format == 'parquet'):
to_parquet(output, attributes, filename, **kwargs)
else:
raise ValueError(f'Invalid format {format}')
[docs]
def to_ascii(
output: dict,
attributes: dict,
filename: str | pathlib.Path,
**kwargs
):
"""
Write data to an ascii file
Parameters
----------
output: dict
python dictionary of output data
attributes: dict
python dictionary of output attributes
filename: str or pathlib.Path
full path of output ascii file
delimiter: str, default ','
delimiter for output spatial file
columns: list, default ['time', 'y', 'x', 'data']
column names of ascii file
header: bool, default False
create a YAML header with data attributes
"""
# set default keyword arguments
kwargs.setdefault('delimiter', ',')
kwargs.setdefault('columns', ['time', 'lat', 'lon', 'tide'])
kwargs.setdefault('header', False)
# get column names
columns = copy.copy(kwargs['columns'])
# output filename
filename = pathlib.Path(filename).expanduser().absolute()
logging.info(str(filename))
# open the output file
fid = filename.open(mode='w', encoding='utf8')
# create a column stack arranging data in column order
data_stack = np.c_[[output[col] for col in columns]]
ncol, nrow = np.shape(data_stack)
# print YAML header to top of file
if kwargs['header']:
fid.write('{0}:\n'.format('header'))
# data dimensions
fid.write('\n {0}:\n'.format('dimensions'))
fid.write(' {0:22}: {1:d}\n'.format('time', nrow))
# non-standard attributes
fid.write(' {0}:\n'.format('non-standard_attributes'))
# data format
fid.write(' {0:22}: ({1:d}f0.8)\n'.format('formatting_string', ncol))
fid.write('\n')
# global attributes
fid.write('\n {0}:\n'.format('global_attributes'))
today = datetime.datetime.now().isoformat()
fid.write(' {0:22}: {1}\n'.format('date_created', today))
# print variable descriptions to YAML header
fid.write('\n {0}:\n'.format('variables'))
# print YAML header with variable attributes
for i, v in enumerate(columns):
fid.write(' {0:22}:\n'.format(v))
for atn, atv in attributes[v].items():
fid.write(' {0:20}: {1}\n'.format(atn, atv))
# add precision and column attributes for ascii yaml header
fid.write(' {0:20}: double_precision\n'.format('precision'))
fid.write(' {0:20}: column {1:d}\n'.format('comment', i+1))
# end of header
fid.write('\n\n# End of YAML header\n')
# write to file for each data point
for line in range(nrow):
line_contents = [f'{d}' for d in data_stack[:, line]]
print(kwargs['delimiter'].join(line_contents), file=fid)
# close the output file
fid.close()
[docs]
def to_netCDF4(
output: dict,
attributes: dict,
filename: str | pathlib.Path,
**kwargs
):
"""
Wrapper function for writing data to a netCDF4 file
Parameters
----------
output: dict
python dictionary of output data
attributes: dict
python dictionary of output attributes
filename: str or pathlib.Path
full path of output netCDF4 file
mode: str, default 'w'
NetCDF file mode
data_type: str, default 'drift'
Input data type
- ``'time series'``
- ``'drift'``
- ``'grid'``
"""
# default arguments
kwargs.setdefault('mode', 'w')
kwargs.setdefault('data_type', 'drift')
# opening NetCDF file for writing
filename = pathlib.Path(filename).expanduser().absolute()
fileID = netCDF4.Dataset(filename, kwargs['mode'], format="NETCDF4")
if kwargs['data_type'] in ('drift',):
kwargs.pop('data_type')
_drift_netCDF4(fileID, output, attributes, **kwargs)
elif kwargs['data_type'] in ('grid',):
kwargs.pop('data_type')
_grid_netCDF4(fileID, output, attributes, **kwargs)
elif kwargs['data_type'] in ('time series',):
kwargs.pop('data_type')
_time_series_netCDF4(fileID, output, attributes, **kwargs)
# add attribute for date created
fileID.date_created = datetime.datetime.now().isoformat()
# add attributes for software information
fileID.software_reference = grounding_zones.version.project_name
fileID.software_version = grounding_zones.version.full_version
# add file-level attributes if applicable
if 'ROOT' in attributes.keys():
# Defining attributes for file
for att_name, att_val in attributes['ROOT'].items():
fileID.setncattr(att_name, att_val)
# Output NetCDF structure information
logging.info(str(filename))
logging.info(list(fileID.variables.keys()))
# Closing the NetCDF file
fileID.close()
[docs]
def _drift_netCDF4(fileID, output: dict, attributes: dict, **kwargs):
"""
Write drift data variables to a netCDF4 file object
Parameters
----------
fileID: obj
open netCDF4 file object
output: dict
python dictionary of output data
attributes: dict
python dictionary of output attributes
"""
# Defining the NetCDF dimensions
fileID.createDimension('time', len(np.atleast_1d(output['time'])))
# defining the NetCDF variables
nc = {}
for key, val in output.items():
if key in fileID.variables:
nc[key] = fileID.variables[key]
elif '_FillValue' in attributes[key].keys():
nc[key] = fileID.createVariable(key, val.dtype, ('time',),
fill_value=attributes[key]['_FillValue'], zlib=True)
attributes[key].pop('_FillValue')
elif val.shape:
nc[key] = fileID.createVariable(key, val.dtype, ('time',))
else:
nc[key] = fileID.createVariable(key, val.dtype, ())
# filling NetCDF variables
nc[key][:] = val
# Defining attributes for variable
for att_name, att_val in attributes[key].items():
nc[key].setncattr(att_name, att_val)
[docs]
def _grid_netCDF4(fileID, output: dict, attributes: dict, **kwargs):
"""
Write gridded data variables to a netCDF4 file object
Parameters
----------
fileID: obj
open netCDF4 file object
output: dict
python dictionary of output data
attributes: dict
python dictionary of output attributes
"""
# output data fields
dimensions = ['t', 'time', 'lon', 'longitude', 'x', 'lat', 'latitude', 'y']
crs = ['crs', 'crs_wkt', 'crs_proj4', 'projection']
fields = sorted(set(output.keys()) - set(dimensions) - set(crs))
# Defining the NetCDF dimensions
reference_fields = [v for v in fields if output[v].ndim == 3]
ny, nx, nt = output[reference_fields[0]].shape
fileID.createDimension('y', ny)
fileID.createDimension('x', nx)
fileID.createDimension('time', nt)
# defining the NetCDF variables
nc = {}
for key, val in output.items():
if key in fileID.variables:
nc[key] = fileID.variables[key]
elif '_FillValue' in attributes[key].keys():
nc[key] = fileID.createVariable(key, val.dtype, ('y', 'x', 'time'),
fill_value=attributes[key]['_FillValue'], zlib=True)
attributes[key].pop('_FillValue')
elif (val.ndim == 3):
nc[key] = fileID.createVariable(key, val.dtype, ('y', 'x', 'time'))
elif (val.ndim == 2):
nc[key] = fileID.createVariable(key, val.dtype, ('y', 'x'))
elif val.shape and (len(val) == ny):
nc[key] = fileID.createVariable(key, val.dtype, ('y',))
elif val.shape and (len(val) == nx):
nc[key] = fileID.createVariable(key, val.dtype, ('x',))
elif val.shape and (len(val) == nt):
nc[key] = fileID.createVariable(key, val.dtype, ('time',))
else:
nc[key] = fileID.createVariable(key, val.dtype, ())
# filling NetCDF variables
nc[key][:] = val
# Defining attributes for variable
for att_name, att_val in attributes[key].items():
nc[key].setncattr(att_name, att_val)
[docs]
def _time_series_netCDF4(fileID, output: dict, attributes: dict, **kwargs):
"""
Write time series data variables to a netCDF4 file object
Parameters
----------
fileID: obj
open netCDF4 file object
output: dict
python dictionary of output data
attributes: dict
python dictionary of output attributes
"""
# output data fields
dimensions = ['t', 'time', 'lon', 'longitude', 'x', 'lat', 'latitude', 'y']
crs = ['crs', 'crs_wkt', 'crs_proj4', 'projection']
fields = sorted(set(output.keys()) - set(dimensions) - set(crs))
# Defining the NetCDF dimensions
reference_fields = [v for v in fields if output[v].ndim == 2]
nstation, nt = output[reference_fields[0]].shape
fileID.createDimension('station', nstation)
fileID.createDimension('time', nt)
# defining the NetCDF variables
nc = {}
for key, val in output.items():
if key in fileID.variables:
nc[key] = fileID.variables[key]
elif '_FillValue' in attributes[key].keys():
nc[key] = fileID.createVariable(key, val.dtype, ('station', 'time'),
fill_value=attributes[key]['_FillValue'], zlib=True)
attributes[key].pop('_FillValue')
elif (val.ndim == 2):
nc[key] = fileID.createVariable(key, val.dtype, ('station', 'time'))
elif val.shape and (len(val) == nt):
nc[key] = fileID.createVariable(key, val.dtype, ('time',))
elif val.shape and (len(val) == nstation):
nc[key] = fileID.createVariable(key, val.dtype, ('station',))
else:
nc[key] = fileID.createVariable(key, val.dtype, ())
# filling NetCDF variables
nc[key][:] = val
# Defining attributes for variable
for att_name, att_val in attributes[key].items():
nc[key].setncattr(att_name, att_val)
[docs]
def to_HDF5(
output: dict,
attributes: dict,
filename: str | pathlib.Path,
**kwargs
):
"""
Write data to a HDF5 file
Parameters
----------
output: dict
python dictionary of output data
attributes: dict
python dictionary of output attributes
filename: str or pathlib.Path
full path of output HDF5 file
mode: str, default 'w'
HDF5 file mode
"""
# set default keyword arguments
kwargs.setdefault('mode', 'w')
# opening HDF5 file for writing
filename = pathlib.Path(filename).expanduser().absolute()
fileID = h5py.File(filename, mode=kwargs['mode'])
# Defining the HDF5 dataset variables
h5 = {}
for key, val in output.items():
if key in fileID:
fileID[key][...] = val[:]
elif '_FillValue' in attributes[key].keys():
h5[key] = fileID.create_dataset(key, val.shape, data=val,
dtype=val.dtype, fillvalue=attributes[key]['_FillValue'],
compression='gzip')
attributes[key].pop('_FillValue')
elif val.shape:
h5[key] = fileID.create_dataset(key, val.shape, data=val,
dtype=val.dtype, compression='gzip')
else:
h5[key] = fileID.create_dataset(key, val.shape,
dtype=val.dtype)
# Defining attributes for variable
for att_name, att_val in attributes[key].items():
h5[key].attrs[att_name] = att_val
# add attribute for date created
fileID.attrs['date_created'] = datetime.datetime.now().isoformat()
# add attributes for software information
fileID.attrs['software_reference'] = grounding_zones.version.project_name
fileID.attrs['software_version'] = grounding_zones.version.full_version
# add file-level attributes if applicable
if 'ROOT' in attributes.keys():
# Defining attributes for file
for att_name, att_val in attributes['ROOT'].items():
fileID.attrs[att_name] = att_val
# Output HDF5 structure information
logging.info(str(filename))
logging.info(list(fileID.keys()))
# Closing the HDF5 file
fileID.close()
[docs]
def to_geotiff(
output: dict,
attributes: dict,
filename: str | pathlib.Path,
**kwargs
):
"""
Write data to a (cloud optimized) geotiff file
Parameters
----------
output: dict
python dictionary of output data
attributes: dict
python dictionary of output attributes
filename: str or pathlib.Path
full path of output geotiff file
varname: str, default 'data'
output variable name
driver: str, default 'cog'
GDAL driver
- ``'GTiff'``: GeoTIFF
- ``'cog'``: Cloud Optimized GeoTIFF
dtype: obj, default osgeo.gdal.GDT_Float64
GDAL data type
options: list, default ['COMPRESS=LZW']
GDAL driver creation options
"""
# set default keyword arguments
kwargs.setdefault('varname', 'data')
kwargs.setdefault('driver', 'cog')
kwargs.setdefault('dtype', osgeo.gdal.GDT_Float64)
kwargs.setdefault('options', ['COMPRESS=LZW'])
varname = copy.copy(kwargs['varname'])
# verify grid dimensions to be iterable
output = expand_dims(output, varname=varname)
# grid shape
ny, nx, nband = np.shape(output[varname])
# output as geotiff or specified driver
driver = osgeo.gdal.GetDriverByName(kwargs['driver'])
# set up the dataset with creation options
filename = pathlib.Path(filename).expanduser().absolute()
ds = driver.Create(str(filename), nx, ny, nband,
kwargs['dtype'], kwargs['options'])
# top left x, w-e pixel resolution, rotation
# top left y, rotation, n-s pixel resolution
xmin, xmax, ymin, ymax = attributes['extent']
dx, dy = attributes['spacing']
ds.SetGeoTransform([xmin, dx, 0, ymax, 0, dy])
# set the spatial projection reference information
osgeo.osr.UseExceptions()
srs = osgeo.osr.SpatialReference()
srs.ImportFromWkt(attributes['wkt'])
# export
ds.SetProjection( srs.ExportToWkt() )
# for each band
for band in range(nband):
# set fill value for band
if '_FillValue' in attributes[varname].keys():
fill_value = attributes[varname]['_FillValue']
ds.GetRasterBand(band+1).SetNoDataValue(fill_value)
# write band to geotiff array
ds.GetRasterBand(band+1).WriteArray(output[varname][:, :, band])
# print filename if verbose
logging.info(str(filename))
# close dataset
ds.FlushCache()
[docs]
def to_parquet(
output: dict,
attributes: dict,
filename: str | pathlib.Path,
**kwargs
):
"""
Write data to a (geo)parquet file
Parameters
----------
output: dict
python dictionary of output data
attributes: dict
python dictionary of output attributes
filename: str or pathlib.Path,
full path of output parquet file
crs: int, default None
coordinate reference system EPSG code
index: bool, default None
write index to parquet file
compression: str, default 'snappy'
file compression type
geoparquet: bool, default False
write geoparquet file
geometry_encoding: str, default 'WKB'
default encoding for geoparquet geometry
primary_column: str, default 'geometry'
default column name for geoparquet geometry
"""
# set default keyword arguments
kwargs.setdefault('crs', None)
kwargs.setdefault('index', None)
kwargs.setdefault('compression', 'snappy')
kwargs.setdefault('schema_version', '1.1.0')
kwargs.setdefault('geoparquet', False)
kwargs.setdefault('geometry_encoding', 'WKB')
kwargs.setdefault('primary_column', 'geometry')
# convert to pandas dataframe
df = pd.DataFrame(output)
attrs = df.attrs.copy()
# add coordinate reference system to attributes
if kwargs['crs'] and isinstance(kwargs['crs'], int):
# create spatial reference object from EPSG code
osgeo.osr.UseExceptions()
srs = osgeo.osr.SpatialReference()
srs.ImportFromEPSG(kwargs['crs'])
# add projection information to attributes
attributes['crs'] = json.loads(srs.ExportToPROJJSON())
elif kwargs['crs'] and isinstance(kwargs['crs'], dict):
# create spatial reference object from PROJJSON
osgeo.osr.UseExceptions()
srs = osgeo.osr.SpatialReference()
srs.SetFromUserInput(json.dumps(kwargs['crs']))
# add projection information to attributes
attributes['crs'] = copy.copy(kwargs['crs'])
# convert spatial coordinates to WKB encoded geometry
if kwargs['geoparquet'] and (kwargs['geometry_encoding'] == 'WKB'):
# get geometry columns
primary_column = kwargs['primary_column']
geometries = ['lon', 'longitude', 'x', 'lat', 'latitude', 'y']
geom_vars = [v for v in geometries if v in output.keys()]
# convert to shapely geometry
points = shapely.points(df[geom_vars[0]], df[geom_vars[1]])
df.drop(columns=geom_vars, inplace=True)
df[primary_column] = shapely.to_wkb(points)
# get bounding box of total set of points
bbox = shapely.MultiPoint(points).bounds
# drop attributes for geometry columns
[attributes.pop(v) for v in geom_vars if v in attributes]
# add attributes for geoparquet
attrs[b"geo"] = attrs.get(b"geo", {})
attrs[b"geo"]["version"] = kwargs['schema_version']
attrs[b"geo"]["primary_column"] = primary_column
attrs[b"geo"]["columns"] = {primary_column: {
"encoding": 'WKB',
"crs": json.loads(srs.ExportToPROJJSON()),
"bbox": bbox,
"covering": {
"bbox": collections.OrderedDict(
xmin=[bbox[0], "xmin"],
ymin=[bbox[1], "ymin"],
xmax=[bbox[2], "xmax"],
ymax=[bbox[3], "ymax"]
)
}
}
}
elif kwargs['geoparquet'] and (kwargs['geometry_encoding'] == 'point'):
raise ValueError('geoarrow encodings are currently unsupported')
# add attribute for date created
attributes['date_created'] = datetime.datetime.now().isoformat()
# add attributes for software information
attributes['software_reference'] = grounding_zones.version.project_name
attributes['software_version'] = grounding_zones.version.full_version
# dump the attributes to encoded JSON-format
attr_metadata = {b"pyTMD": json.dumps(attributes).encode('utf-8')}
for att_name, att_val in attrs.items():
attr_metadata[att_name] = json.dumps(att_val).encode('utf-8')
# convert dataframe to arrow table
table = pyarrow.Table.from_pandas(df,
preserve_index=kwargs['index'])
# update parquet metadata
metadata = table.schema.metadata
metadata.update(attr_metadata)
# replace schema metadata with updated
table = table.replace_schema_metadata(metadata)
# write arrow table to (geo)parquet file
filename = pathlib.Path(filename).expanduser().absolute()
logging.info(str(filename))
pyarrow.parquet.write_table(table, filename,
compression=kwargs['compression']
)
[docs]
def expand_dims(obj: dict, varname: str = 'data'):
"""
Add a singleton dimension to a spatial dictionary if non-existent
Parameters
----------
obj: dict
python dictionary of data
varname: str, default data
variable name to expand
"""
# change time dimensions to be iterableinformation
try:
obj['time'] = np.atleast_1d(obj['time'])
except:
pass
# output spatial with a third dimension
if isinstance(varname, list):
for v in varname:
obj[v] = np.atleast_3d(obj[v])
elif isinstance(varname, str):
obj[varname] = np.atleast_3d(obj[varname])
# return reformed spatial dictionary
return obj
[docs]
def default_field_mapping(variables: list | np.ndarray):
"""
Builds field mappings from a variable list
Parameters
----------
variables: list
Variables from argument parser
- ``time``
- ``yname``
- ``xname``
- ``varname``
Returns
-------
field_mapping: dict
Field mappings for netCDF4/HDF5 read functions
"""
# get each variable name and add to field mapping dictionary
field_mapping = {}
for i, var in enumerate(['time', 'y', 'x', 'data']):
try:
field_mapping[var] = copy.copy(variables[i])
except IndexError as exc:
pass
# return the field mapping
return field_mapping
[docs]
def inverse_mapping(field_mapping):
"""
Reverses the field mappings of a dictionary
Parameters
----------
field_mapping: dict
Field mappings for netCDF4/HDF5 read functions
"""
return field_mapping.__class__(map(reversed, field_mapping.items()))