This notebook shows how to generate a valid STAC collection, which is a requirement to upload research outcomes to the ESA Project Results Repository (PRR). It focuses on generating metadata for a project with multiple data files of different types.
Check the EarthCODE documentation, and PRR STAC introduction example for a more general introduction to STAC and the ESA PRR.
The code below demonstrates how to perform the necessary steps using real data from the ESA project Terrestrial Carbon Community Assimilation System (TCCAS). The focus of TCCAS is the combination of a diverse array of observational data streams with the D&B terrestrial biosphere model into a consistent picture of the terrestrial carbon, water, and energy cycles.
🔗 Check the project website: Terrestrial Carbon Community Assimilation System (TCCAS) – Website
🛢️ TCCAS Dataset: Terrestrial Carbon Community Assimilation System (TCCAS) – Data base: Sodankylä and Lapland region
# import libraries
import xarray as xr
from pystac import Item, Collection
import pystac
from datetime import datetime
from shapely.geometry import box, mapping
from xstac import xarray_to_stac
import glob
import json
import shapely
import numpy as np
import pandas as pd

## 2. Define functions to be reused between regions
def add_insitu_data(collectionid, geometry, bbox, start_time, collection, insitu_href):
    """Add one STAC Item wrapping the in-situ .tgz package to *collection*.

    Parameters
    ----------
    collectionid : str
        Id of the parent collection; used for the item id and the asset href.
    geometry : dict
        GeoJSON geometry covering the region.
    bbox : list[float]
        Bounding box of the region.
    start_time : datetime-like
        Nominal datetime assigned to the item.
    collection : pystac.Collection
        Collection the new item is appended to.
    insitu_href : str
        File name of the in-situ package, relative to the collection folder.
    """
    print(start_time)  # echo the timestamp so notebook users can sanity-check it
    item = Item(
        id=f"{collectionid}-insitu_package",
        geometry=geometry,
        datetime=start_time,
        bbox=bbox,
        properties={
            "license": "CC-BY-4.0",
            "description": 'Insitu package with FloX, VOD and Miscellaneous field datasets related to the TCCAS project. ',
        }
    )
    # add an asset (the actual link to the file)
    item.add_asset(
        key='Insitu package',  # title can be arbitrary
        asset=pystac.Asset(
            href=f'./{collectionid}/{insitu_href}',
            media_type="application/tar+gzip",
            roles=["data"],
        )
    )
    # make sure the item is valid STAC before attaching it
    item.validate()
    collection.add_item(item)


def add_documentation_item(collectionid, geometry, bbox, start_time, collection):
    """Add all project documentation (PDFs) to *collection* under a single STAC Item.

    Parameters are as in ``add_insitu_data`` (minus the href: the three
    documentation assets have fixed file names).
    """
    item = Item(
        id=f"{collectionid}-documentation",
        geometry=geometry,
        datetime=start_time,
        bbox=bbox,
        properties={
            "license": "CC-BY-4.0",
            "description": 'Documentation for the TCCAS project datasets.',
        }
    )
    item.add_asset(
        key='TCCAS user manual.',  # title can be arbitrary
        asset=pystac.Asset(
            href=f'./{collectionid}/TCCAS_manual.pdf',
            media_type="application/pdf",
            roles=["documentation"],
        )
    )
    item.add_asset(
        key="Satellite Data Uncertainty analysis Scientific Report",  # title can be arbitrary
        asset=pystac.Asset(
            href=f'./{collectionid}/D7.pdf',
            media_type="application/pdf",
            roles=["documentation"],
        )
    )
    item.add_asset(
        key="Campaign Data User Manual",  # title can be arbitrary
        asset=pystac.Asset(
            href=f'./{collectionid}/D11_CDUM-all_sites.pdf',
            media_type="application/pdf",
            roles=["documentation"],
        )
    )
    # BUGFIX (consistency): every other item builder validates before adding;
    # this one previously skipped validation, so a malformed item could slip in.
    item.validate()
    collection.add_item(item)

# add an item with multiple model forcing assets
def create_model_forcing_item(collectionid, geometry, bbox, start_time, collection, model_forcing):
    """Bundle every model-forcing netCDF file into one STAC Item on *collection*.

    ``model_forcing`` maps a human-readable asset title to the file name of a
    forcing dataset (relative to the collection folder); each entry becomes
    one asset on the item. The item is validated before it is attached.
    """
    forcing_properties = {
        "license": "CC-BY-4.0",
        "description": ' Regional and Site-level model forcing Data Sets for Sodankylä and Lapland region, part of the TCCAS project.',
    }
    forcing_item = Item(
        id=f"{collectionid}-model_forcing",
        geometry=geometry,
        datetime=start_time,
        bbox=bbox,
        properties=forcing_properties,
    )
    # one asset per forcing dataset; the dict key doubles as the asset title
    for asset_title, file_name in model_forcing.items():
        forcing_asset = pystac.Asset(
            href=f'./{collectionid}/{file_name}',
            media_type="application/x-netcdf",
            roles=["data"],
        )
        forcing_item.add_asset(key=asset_title, asset=forcing_asset)
    forcing_item.validate()
    collection.add_item(forcing_item)
# some attributes extracted from xarray are not json serialisable and have to be cast to other types.
def convert_to_json_serialisable(attrs):
    """Return a copy of *attrs* with numpy values cast to JSON-serialisable types.

    ndarrays become plain lists and numpy scalars (int*, float*, bool_, ...)
    become the equivalent built-in Python scalars; everything else is passed
    through unchanged. The input mapping is not mutated.
    """
    attrs = attrs.copy()
    for key, value in attrs.items():
        if isinstance(value, np.ndarray):
            attrs[key] = value.tolist()
        elif isinstance(value, np.generic):
            # generalises the old string-match on 'numpy.int': .item() converts
            # any numpy scalar (int, float, bool, ...) to its Python equivalent
            attrs[key] = value.item()
    return attrs


def _extract_time_range(ds):
    """Best-effort (start, end) datetimes for a TCCAS dataset.

    Tries, in order: a CF 'time' coordinate, a 'yymmddHH' variable, and a
    'yymmddHHMMSS' variable (the latter two store date components per record;
    the first three components are assumed to be year, month, day).
    """
    if 'time' in ds.coords:
        # naive (no 'Z') epoch: np.datetime64 is timezone-naive, and the 'Z'
        # suffix triggers a UserWarning without changing the value
        epoch = np.datetime64('1970-01-01T00:00:00')

        def to_datetime(value):
            return datetime.fromtimestamp((value - epoch) / np.timedelta64(1, 's'))

        return to_datetime(ds['time'][0].values), to_datetime(ds['time'][-1].values)
    if 'yymmddHH' in ds.variables:
        start_str = '-'.join(ds['yymmddHH'][0].values.astype(str)[:3])
        end_str = '-'.join(ds['yymmddHH'][-1].values.astype(str)[:3])
    else:
        start_str = '-'.join(ds['yymmddHHMMSS'][0].values.astype(int).astype(str)[:3])
        # BUGFIX: the end date must come from the last record ([-1]); the
        # original reused record [0], so start and end were always identical
        end_str = '-'.join(ds['yymmddHHMMSS'][-1].values.astype(int).astype(str)[:3])
    return (datetime.strptime(start_str, '%Y-%m-%d'),
            datetime.strptime(end_str, '%Y-%m-%d'))


def create_items_from_data(collectionid, geometry, bbox, collection, root_url, data_files, region):
    """Create one STAC Item per netCDF dataset and add it to *collection*.

    Parameters
    ----------
    collectionid : str
        Id of the parent collection; used to build asset hrefs.
    geometry, bbox :
        GeoJSON geometry and bounding box shared by all items.
    collection : pystac.Collection
        Collection the items are appended to.
    root_url : str
        Base URL (or local path) the dataset file paths are relative to.
    data_files : dict[str, str]
        Maps dataset display name -> file path relative to *root_url*.
    region : str
        Region tag stored in each item's properties.
    """
    for dataset_name, dataset_filepath in data_files.items():
        # 1. open the netcdf file ('#mode=bytes' enables remote byte-range reads)
        ds = xr.open_dataset(root_url + dataset_filepath + '#mode=bytes')
        start_time, end_time = _extract_time_range(ds)
        description = ds.attrs.get(
            'description', f'Dataset with variables related to {dataset_name}.')
        template = {
            "id": f"{collection.id}-{dataset_name.lower().replace(' ', '_')}",
            "type": "Feature",
            "stac_version": "1.0.0",
            "properties": {
                "title": dataset_name,
                "description": description,
                "start_datetime": start_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                "end_datetime": end_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                "region": region,
            },
            "geometry": geometry,
            "bbox": bbox,
            "assets": {
                "data": {
                    "href": f"./{collectionid}/{dataset_filepath.split('/')[-1]}",  # or local path
                    "type": "application/x-netcdf",
                    "roles": ["data"],
                    "title": dataset_name
                }
            }
        }
        # remove numpy values from attrs so the item serialises to JSON
        for var in ds.variables:
            ds[var].attrs = convert_to_json_serialisable(ds[var].attrs)
        # 2. work out which coordinates (if any) describe the horizontal grid.
        # BUGFIX: the original tested 'lon' with a separate `if`, so a dataset
        # with only lon/lat coords fell through to the final `else` and had its
        # dimensions clobbered to False. 'longitude' is checked first to keep
        # the original precedence when both spellings are present.
        if 'longitude' in ds.coords:
            x_dim, y_dim = 'longitude', 'latitude'
        elif 'lon' in ds.coords:
            x_dim, y_dim = 'lon', 'lat'
        elif 'x' in ds.coords:
            x_dim, y_dim = 'x', 'y'
        else:
            x_dim, y_dim = False, False
        # 3. Generate the STAC Item
        item = xarray_to_stac(
            ds,
            template,
            temporal_dimension="time" if 'time' in ds.variables else False,
            x_dimension=x_dim,
            y_dimension=y_dim,
            reference_system=False
        )
        # validate and add the STAC Item to the collection
        item.validate()
        # manually add license after validation since its new
        item.properties['license'] = ds.attrs['license']
        collection.add_item(item)
## 2.2 Define ‘Sodankylä and Lapland’ data and links
# define dataset names and base url. If the data is locally stored, then you have to adjust these paths.
# create the parent collection
root_url = 'https://lcc.inversion-lab.com/data/eo/'
region = 'sodankylae'
collectionid = "tccas-sodankylae"
# bbox is [min lon, min lat, max lon, max lat] (matches the collection extent below)
bbox = [18.00, 65.00, 32.00, 69.00]
# round-trip through json to turn shapely's geo interface into a plain GeoJSON dict
geometry = json.loads(json.dumps(shapely.box(*bbox).__geo_interface__))
# nominal datetime used for items that have no intrinsic time axis (in-situ, docs, forcing)
data_time = pd.to_datetime('2021-12-31T00:00:00Z')
# dataset display name -> file path relative to root_url
data_files = {
    # NOTE(review): this entry starts with '/' unlike the others — confirm the resulting URL is valid
    "Fraction of absorbed Photosynthetic Active Radiation Leaf Area Index (JRC-TIP)": "/jrc-tip/jrctip_fapar-lai_sodankyla_20110101-20220105.nc",
    "Brightness temperature (SMOS TB)": "smos/smos_l3tb/SMOS_L3TB__sodankyla.nc",
    "Soil moisture and Vegetation Optical Depth (SMOS SM and SMOS L-VOD)": "smos/smosL2/smosL2_1D_v700_sodankyla_trans.nc",
    "Solar Induced Chlorophyll Fluorescence (Sentinel 5P)": "sif/tropomi/Sodankyla_SIF_TROPOMI_final.nc4",
    "Slope (ASCAT Slope)": "ascat/local_slope.final/ASCAT_slope_so.nc",
    "Photochemical Reflectance Index (MODIS PRI)": "modis/final/PRI_ESTIMATE_SODANKYLA_SINUSOIDAL.nc",
    "Land Surface Temperature (MODIS LST)": "modis/final/LST_ESTIMATE_SODANKYLA_SINUSOIDAL.nc",
    "Solar Induced Chlorophyll Fluorescence (OCO-2 SIF)": "sif/oco2/Sodankyla_SIF_OCO2_final.nc4",
    "Vegetation Optical Depth (AMSR-2 VOD)": "amsr2/final/AMSR2_so.nc"
}
# asset title -> forcing file name (site-level and regional, static and time-dependent)
model_forcing = {
    "static-site-level": "FI-Sod_staticforcing.nc",
    "time-dependent (ERA5) - site level": "FI-Sod_dynforcing-era5_20090101-20211231_with-lwdown.nc",
    "time-dependent (in-situ) - site level": "FI-Sod_dynforcing-insitu_20090101-20211231_with-insitu-lwdown.nc",
    "static-regional": "sodankyla-region_cgls-pft-crops-redistributed_staticforcing.nc",
    "time-dependent (ERA5) - regional": "sodankyla-region_dynforcing_era5_2009-2021.nc"
}
# parent collection for the region; items are attached by the helper functions above
collection = Collection.from_dict(
    {
        "type": "Collection",
        "id": collectionid,
        "stac_version": "1.1.0",
        "title": "Terrestrial Carbon Community Assimilation System: Database for Lapland and Sodankyla region",
        "description": "The Terrestrial Carbon Community Assimilation System (TCCAS) is built around the coupled D&B terrestrial biosphere model. D&B has been newly developed based on the well-established DALEC and BETHY models and builds on the strengths of each component model. In particular, D&B combines the dynamic simulation of the carbon pools and canopy phenology of DALEC with the dynamic simulation of water pools, and the canopy model of photosynthesis and energy balance of BETHY. D&B includes a set of observation operators for optical as well as active and passive microwave observations. The focus of TCCAS is the combination of this diverse array of observational data streams with the D&B model into a consistent picture of the terrestrial carbon, water, and energy cycles. TCCAS applies a variational assimilation approach that adjusts a combination of initial pool sizes and process parameters to match the observational data streams. This dataset includes Satelite, Field and model forcing data sets for Sodankylä and Lapland region.",
        "extent": {
            "spatial": {
                "bbox": [
                    [
                        18.00,
                        65.00,
                        32.00,
                        69.00
                    ]
                ]
            },
            "temporal": {
                "interval": [
                    [
                        "2011-01-01T00:00:00Z",
                        "2021-12-31T00:00:00Z"
                    ]
                ]
            }
        },
        # individual datasets carry their own license in their item properties
        "license": "various",
        "links": []
    }
)
collection # visualise the metadata of your collection create_items_from_data(
collectionid,
geometry,
bbox,
collection,
root_url,
data_files,
region,
)/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:8: FutureWarning: In a future version of xarray decode_timedelta will default to False rather than None. To silence this warning, set decode_timedelta to True, False, or a 'CFTimedeltaCoder' instance.
ds = xr.open_dataset(root_url + dataset_filepath + '#mode=bytes')
/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
# add a single item with all the in-situ data, since it comes in a single .tgz file
add_insitu_data(
collectionid,
geometry,
bbox,
data_time,
collection,
'sodankyla-insitu-package.tgz',
)2021-12-31 00:00:00+00:00
create_model_forcing_item(
collectionid,
geometry,
bbox,
data_time,
collection,
model_forcing)add_documentation_item(
collectionid,
geometry,
bbox,
data_time,
collection)# save the full self-contained collection
collection.normalize_and_save(
root_href=f'../../prr_preview/{collectionid}',
catalog_type=pystac.CatalogType.SELF_CONTAINED
)collection2.2 Define ‘Netherlands region (including Reusel)’ data and links¶
# define dataset names and base url. If the data is locally stored, then you have to adjust these paths.
# create the parent collection
root_url = 'https://lcc.inversion-lab.com/data/eo/'
region = 'netherlands'
collectionid = "tccas-netherlands"
# bbox is [min lon, min lat, max lon, max lat] (matches the collection extent below)
bbox = [4.502, 47.502, 11.477, 51.999]
# round-trip through json to turn shapely's geo interface into a plain GeoJSON dict
geometry = json.loads(json.dumps(shapely.box(*bbox).__geo_interface__))
# nominal datetime used for items that have no intrinsic time axis (in-situ, docs)
data_time = pd.to_datetime('2021-12-31T00:00:00Z')
# dataset display name -> file path relative to root_url
# NOTE(review): the SMOS file appears twice (VOD and Soil moisture) — both items
# will reference the same netCDF; confirm this is intended
data_files = {
    "Vegetation Optical Depth (SMOS)": "smos/smosL2/smosL2_1D_v700_reusel_trans.nc",
    "Slope": "ascat/local_slope.final/ASCAT_slope_re.nc",
    "Solar Induced Chlorophyll Fluorescence": "sif/tropomi/Reusel_SIF_TROPOMI_final.nc4",
    "Soil moisture": "smos/smosL2/smosL2_1D_v700_reusel_trans.nc",
    "Brightness temperature": "smos/smos_l3tb/SMOS_L3TB__reusel.nc",
    "Vegetation Optical Depth": "amsr2/final/AMSR2_re.nc",
    "Solar Induced Chlorophyll Fluorescence (OCO2)": "sif/oco2/Reusel_SIF_OCO2_final.nc4",
    "Land Surface Temperature": "modis/final/LST_ESTIMATE_REUSEL_SINUSOIDAL.nc",
    "Photochemical Reflectance Index": "modis/final/PRI_ESTIMATE_REUSEL_SINUSOIDAL.nc"
}
# parent collection for the region; items are attached by the helper functions above
collection = Collection.from_dict(
    {
        "type": "Collection",
        "id": collectionid,
        "stac_version": "1.1.0",
        "title": "Terrestrial Carbon Community Assimilation System: Database for the Netherlands region (including Reusel)",
        "description": "The Terrestrial Carbon Community Assimilation System (TCCAS) is built around the coupled D&B terrestrial biosphere model. D&B has been newly developed based on the well-established DALEC and BETHY models and builds on the strengths of each component model. In particular, D&B combines the dynamic simulation of the carbon pools and canopy phenology of DALEC with the dynamic simulation of water pools, and the canopy model of photosynthesis and energy balance of BETHY. D&B includes a set of observation operators for optical as well as active and passive microwave observations. The focus of TCCAS is the combination of this diverse array of observational data streams with the D&B model into a consistent picture of the terrestrial carbon, water, and energy cycles. TCCAS applies a variational assimilation approach that adjusts a combination of initial pool sizes and process parameters to match the observational data streams. This dataset includes Satelite, Field and model forcing data sets for the Netherlands region, including Reusel.",
        "extent": {
            "spatial": {
                "bbox": [
                    [4.502, 47.502, 11.477, 51.999]
                ]
            },
            "temporal": {
                "interval": [
                    [
                        "2011-01-01T00:00:00Z",
                        "2021-12-31T00:00:00Z"
                    ]
                ]
            }
        },
        # individual datasets carry their own license in their item properties
        "license": "various",
        "links": []
    }
)
collection # visualise the metadata of your collection create_items_from_data(
collectionid,
geometry,
bbox,
collection,
root_url,
data_files,
region,
)/tmp/ipykernel_119170/4235652563.py:8: FutureWarning: In a future version of xarray decode_timedelta will default to False rather than None. To silence this warning, set decode_timedelta to True, False, or a 'CFTimedeltaCoder' instance.
ds = xr.open_dataset(root_url + dataset_filepath + '#mode=bytes')
/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
# add a single item with all the in-situ data, since it comes in a single .tgz file
add_insitu_data(
collectionid,
geometry,
bbox,
data_time,
collection,
'reusel-insitu-package.tgz',
)2021-12-31 00:00:00+00:00
add_documentation_item(
collectionid,
geometry,
bbox,
data_time,
collection)# save the full self-contained collection
collection.normalize_and_save(
root_href=f'../../prr_preview/{collectionid}',
catalog_type=pystac.CatalogType.SELF_CONTAINED
)collection2.2 Define ‘Majadas del Tietar and Iberian region’ data and links¶
# define dataset names and base url. If the data is locally stored, then you have to adjust these paths.
# create the parent collection
root_url = 'https://lcc.inversion-lab.com/data/eo/'
region = 'iberia'
collectionid = "tccas-iberia"
# bbox is [min lon, min lat, max lon, max lat] (matches the collection extent below)
bbox = [-8.496, 38.501, -2.505, 42.997]
# round-trip through json to turn shapely's geo interface into a plain GeoJSON dict
geometry = json.loads(json.dumps(shapely.box(*bbox).__geo_interface__))
# nominal datetime used for items that have no intrinsic time axis (in-situ, docs, forcing)
data_time = pd.to_datetime('2021-12-31T00:00:00Z')
# dataset display name -> file path relative to root_url
# NOTE(review): the SMOS file appears twice (VOD and Soil moisture) — both items
# will reference the same netCDF; confirm this is intended
data_files = {
    "Vegetation Optical Depth (SMOS)": "smos/smosL2/smosL2_1D_v700_lasmajadas_trans.nc",
    "Slope": "ascat/local_slope.final/ASCAT_slope_lm.nc",
    "Solar Induced Chlorophyll Fluorescence": "sif/tropomi/Majadas_SIF_TROPOMI_final.nc4",
    "Fraction of absorbed Photosynthetic Active Radiation, Leaf Area Index": "jrc-tip/jrctip_fapar-lai_majadas_20110101-20220105.nc",
    "Soil moisture": "smos/smosL2/smosL2_1D_v700_lasmajadas_trans.nc",
    "Brightness temperature": "smos/smos_l3tb/SMOS_L3TB__lasmajadas.nc",
    "Vegetation Optical Depth": "amsr2/final/AMSR2_lm.nc",
    "Land Surface Temperature": "modis/final/LST_ESTIMATE_MAJADAS_SINUSOIDAL.nc",
    "Photochemical Reflectance Index": "modis/final/PRI_ESTIMATE_MAJADAS_SINUSOIDAL.nc",
    "Solar Induced Chlorophyll Fluorescence (OCO2)": "sif/oco2/Majadas_SIF_OCO2_final.nc4"
}
# asset title -> forcing file name (site-level and regional, static and time-dependent)
model_forcing = {
    "static": "ES-LM1_staticforcing.nc",
    "time-dependent (ERA5)": "ES-LM1_dynforcing-era5_20090101-20211231_with-lwdown.nc",
    "time-dependent (in-situ)": "ES-LM1_dynforcing-insitu_20140401-20220930_with-insitu-lwdown.nc",
    "static-regional": "majadas-region_cgls-pft-moli2bare_staticforcing.nc",
    "time-dependent (ERA5)-regional": "majadas-region_dynforcing_era5_2009-2021.nc"
}
# parent collection for the region; items are attached by the helper functions above
collection = Collection.from_dict(
    {
        "type": "Collection",
        "id": collectionid,
        "stac_version": "1.1.0",
        "title": "Terrestrial Carbon Community Assimilation System: Database for Iberian region (including Majadas del Tietar)",
        "description": "The Terrestrial Carbon Community Assimilation System (TCCAS) is built around the coupled D&B terrestrial biosphere model. D&B has been newly developed based on the well-established DALEC and BETHY models and builds on the strengths of each component model. In particular, D&B combines the dynamic simulation of the carbon pools and canopy phenology of DALEC with the dynamic simulation of water pools, and the canopy model of photosynthesis and energy balance of BETHY. D&B includes a set of observation operators for optical as well as active and passive microwave observations. The focus of TCCAS is the combination of this diverse array of observational data streams with the D&B model into a consistent picture of the terrestrial carbon, water, and energy cycles. TCCAS applies a variational assimilation approach that adjusts a combination of initial pool sizes and process parameters to match the observational data streams. This dataset includes Satelite, Field and model forcing data sets for the Iberian region, including Majadas del Tietar.",
        "extent": {
            "spatial": {
                "bbox": [
                    [-8.496, 38.501, -2.505, 42.997]
                ]
            },
            "temporal": {
                "interval": [
                    [
                        "2011-01-01T00:00:00Z",
                        "2021-12-31T00:00:00Z"
                    ]
                ]
            }
        },
        # individual datasets carry their own license in their item properties
        "license": "various",
        "links": []
    }
)
collection # visualise the metadata of your collection create_items_from_data(
collectionid,
geometry,
bbox,
collection,
root_url,
data_files,
region,
)/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:8: FutureWarning: In a future version of xarray decode_timedelta will default to False rather than None. To silence this warning, set decode_timedelta to True, False, or a 'CFTimedeltaCoder' instance.
ds = xr.open_dataset(root_url + dataset_filepath + '#mode=bytes')
/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
# add a single item with all the in-situ data, since it comes in a single .tgz file
add_insitu_data(
collectionid,
geometry,
bbox,
data_time,
collection,
'lasmajadas-insitu-package.tgz',
)2021-12-31 00:00:00+00:00
create_model_forcing_item(
collectionid,
geometry,
bbox,
data_time,
collection,
model_forcing)add_documentation_item(
collectionid,
geometry,
bbox,
data_time,
collection)# save the full self-contained collection
collection.normalize_and_save(
root_href=f'../../prr_preview/{collectionid}',
catalog_type=pystac.CatalogType.SELF_CONTAINED
)collectionAcknowledgments¶
We gratefully acknowledge the Terrestrial Carbon Community Assimilation System (TCCAS) team for providing access to the data used in this example, as well as support in creating it.