
Generating STAC metadata for the TCCAS dataset

This notebook shows how to generate a valid STAC collection, which is a requirement to upload research outcomes to the ESA Project Results Repository (PRR). It focuses on generating metadata for a project with multiple data files of different types.

Check the EarthCODE documentation and the PRR STAC introduction example for a more general introduction to STAC and the ESA PRR.

The code below demonstrates how to perform the necessary steps using real data from the ESA project Terrestrial Carbon Community Assimilation System (TCCAS). The focus of TCCAS is the combination of a diverse array of observational data streams with the D&B terrestrial biosphere model into a consistent picture of the terrestrial carbon, water, and energy cycles.

🔗 Check the project website: Terrestrial Carbon Community Assimilation System (TCCAS) – Website

🛢️ TCCAS Dataset: Terrestrial Carbon Community Assimilation System (TCCAS) – Database: Sodankylä and Lapland region

# import libraries
import xarray as xr
from pystac import Item, Collection
import pystac
from datetime import datetime
from shapely.geometry import box, mapping
from xstac import xarray_to_stac
import glob
import json
import shapely
import numpy as np
import pandas as pd
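
Before defining the project-specific helpers, the short sketch below (purely illustrative, with made-up ids and file names, relying on the imports above) shows the basic structure this notebook builds with pystac: a Collection that carries the spatial and temporal extent, Items that describe individual datasets, and Assets that point at the actual files.

# purely illustrative: a hypothetical collection with one item and one asset,
# mirroring the Collection -> Item -> Asset structure built for TCCAS below
demo_collection = Collection(
    id="demo-project",
    description="Minimal example collection.",
    extent=pystac.Extent(
        spatial=pystac.SpatialExtent([[18.0, 65.0, 32.0, 69.0]]),
        temporal=pystac.TemporalExtent([[datetime(2011, 1, 1), datetime(2021, 12, 31)]]),
    ),
)
demo_item = Item(
    id="demo-project-example-dataset",
    geometry=None,  # a GeoJSON geometry dict in a real item
    bbox=None,
    datetime=datetime(2021, 12, 31),
    properties={"license": "CC-BY-4.0"},
)
demo_item.add_asset(
    "data",
    pystac.Asset(href="./example.nc", media_type="application/x-netcdf", roles=["data"]),
)
demo_collection.add_item(demo_item)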

Define functions to be reused between regions

def add_insitu_data(collectionid, geometry, bbox, start_time, collection, insitu_href):
    print(start_time)
    item = Item(
        id=f"{collectionid}-insitu_package",
        geometry=geometry,
        datetime=start_time,
        bbox=bbox,
        properties={
            "license": "CC-BY-4.0",
            "description": 'Insitu package with FloX, VOD and miscellaneous field datasets related to the TCCAS project.',
        }
    )

    # add an asset (the actual link to the file)
    item.add_asset(
                key='Insitu package', # title can be arbitrary
                asset=pystac.Asset(
                    href=f'./{collectionid}/{insitu_href}',
                    media_type="application/tar+gzip",
                    roles=["data"],
                )
    )

    item.validate()
    collection.add_item(item)
def add_documentation_item(collectionid, geometry, bbox, start_time, collection):
    # add all the documentation under a single item
    item = Item(
        id=f"{collectionid}-documentation",
        geometry=geometry,
        datetime=start_time,
        bbox=bbox,
        properties= {
            "license": "CC-BY-4.0",
            "description": 'Documentation for the TCCAS project datasets.',
        }
    )

    item.add_asset(
                key='TCCAS user manual', # title can be arbitrary
                asset=pystac.Asset(
                    href=f'./{collectionid}/TCCAS_manual.pdf',
                    media_type="application/pdf",
                    roles=["documentation"],
                )
    )

    item.add_asset(
                key="Satellite Data Uncertainty analysis Scientific Report", # title can be arbitrary
                asset=pystac.Asset(
                    href=f'./{collectionid}/D7.pdf',
                    media_type="application/pdf",
                    roles=["documentation"],
                )
    )

    item.add_asset(
                key="Campaign Data User Manual", # title can be arbitrary
                asset=pystac.Asset(
                    href=f'./{collectionid}/D11_CDUM-all_sites.pdf',
                    media_type="application/pdf",
                    roles=["documentation"],
                )
    )

    collection.add_item(item)
# add an item with multiple model forcing assets

def create_model_forcing_item(collectionid, geometry, bbox, start_time, collection, model_forcing):

    item = Item(
        id=f"{collectionid}-model_forcing",
        geometry=geometry,
        datetime=start_time,
        bbox=bbox,
        properties={
            "license": "CC-BY-4.0",
            "description": 'Regional and site-level model forcing data sets, part of the TCCAS project.',
        }
    )

    for k,v in model_forcing.items():
        item.add_asset(
                    key=k, # title can be arbitrary
                    asset=pystac.Asset(
                        href=f'./{collectionid}/{v}',
                        media_type="application/x-netcdf",
                        roles=["data"],
                    )
        )

    item.validate()
    collection.add_item(item)
# some attributes extracted from xarray are not JSON serialisable and have to be cast to plain Python types.
def convert_to_json_serialisable(attrs):
    attrs = attrs.copy()
    for attr in attrs.keys():
        if isinstance(attrs[attr], np.ndarray):
            attrs[attr] = attrs[attr].tolist()
        elif isinstance(attrs[attr], np.integer):
            attrs[attr] = int(attrs[attr])
    return attrs
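
As a quick illustration (with a made-up attribute dictionary), the conversion turns numpy scalars and arrays into plain Python types that json can serialise:

# hypothetical attrs, just to show the effect of the conversion
example_attrs = {'scale_factor': np.int32(10), 'valid_range': np.array([0, 100])}
json.dumps(convert_to_json_serialisable(example_attrs))
# json.dumps(example_attrs) would raise a TypeError instead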
def create_items_from_data(collectionid, geometry, bbox, collection, root_url, data_files, region):

    for dataset_name, dataset_filepath in data_files.items():

        # 1. open the netcdf file
        ds = xr.open_dataset(root_url + dataset_filepath + '#mode=bytes')

        if 'time' in ds.coords:
            start_time = ds['time'][0].values
            ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
            start_time = datetime.fromtimestamp(ts)

            end_time = ds['time'][-1].values
            ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
            end_time = datetime.fromtimestamp(ts)

        elif 'yymmddHH' in ds.variables:
            string_date = '-'.join(ds['yymmddHH'][0].values.astype(str)[:3])
            start_time = datetime.strptime(string_date, '%Y-%m-%d')

            string_date = '-'.join(ds['yymmddHH'][-1].values.astype(str)[:3])
            end_time = datetime.strptime(string_date, '%Y-%m-%d')
        else:
            string_date = '-'.join(ds['yymmddHHMMSS'][0].values.astype(int).astype(str)[:3])
            start_time = datetime.strptime(string_date, '%Y-%m-%d')

            string_date = '-'.join(ds['yymmddHHMMSS'][-1].values.astype(int).astype(str)[:3])
            end_time = datetime.strptime(string_date, '%Y-%m-%d')

        if 'description' in ds.attrs:
            description = ds.attrs['description']
        else:
            description = f'Dataset with variables related to {dataset_name}.'
            
        # 2. build the STAC Item template with the core metadata
        template = {

            "id": f"{collection.id}-{dataset_name.lower().replace(' ', '_')}",
            "type": "Feature",
            "stac_version": "1.0.0",
            "properties": {
                "title": dataset_name,
                "description": description,
                "start_datetime": start_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                "end_datetime": end_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                "region": region,
            },
            "geometry": geometry,
            "bbox": bbox,
            "assets": {
                "data": {
                    "href": f"./{collectionid}/{dataset_filepath.split('/')[-1]}",  # or local path
                    "type": "application/x-netcdf",
                    "roles": ["data"],
                    "title": dataset_name
                }
            }
        }

        # remove numpy values from attrs:
        for var in ds.variables:
            ds[var].attrs = convert_to_json_serialisable(ds[var].attrs)

        if 'lon' in ds.coords:
            x_dim, y_dim = 'lon', 'lat'
        elif 'longitude' in ds.coords:
            x_dim, y_dim = 'longitude', 'latitude'
        elif 'x' in ds.coords:
            x_dim, y_dim = 'x', 'y'
        else:
            x_dim, y_dim = False, False
        
        # 3. Generate the STAC Item
        item = xarray_to_stac(
            ds,
            template,
            temporal_dimension="time" if 'time' in ds.variables else False,
            x_dimension=x_dim,
            y_dimension=y_dim,  
            reference_system=False
        )

        # validate and add the STAC Item to the collection
        item.validate()

        # manually add the license after validation since it's new
        item.properties['license'] = ds.attrs['license']
        collection.add_item(item)
    
# define dataset names and the base URL. If the data is stored locally, adjust these paths accordingly.

# create the parent collection
root_url = 'https://lcc.inversion-lab.com/data/eo/'
region = 'sodankylae'
collectionid = "tccas-sodankylae"
bbox = [18.00, 65.00, 32.00, 69.00]
geometry = json.loads(json.dumps(shapely.box(*bbox).__geo_interface__))
data_time = pd.to_datetime('2021-12-31T00:00:00Z')

data_files = {
    "Fraction of absorbed Photosynthetic Active Radiation Leaf Area Index (JRC-TIP)": "/jrc-tip/jrctip_fapar-lai_sodankyla_20110101-20220105.nc",
    "Brightness temperature (SMOS TB)": "smos/smos_l3tb/SMOS_L3TB__sodankyla.nc",
    "Soil moisture and Vegetation Optical Depth (SMOS SM and SMOS L-VOD)": "smos/smosL2/smosL2_1D_v700_sodankyla_trans.nc",
    "Solar Induced Chlorophyll Fluorescence (Sentinel 5P)": "sif/tropomi/Sodankyla_SIF_TROPOMI_final.nc4",
    "Slope (ASCAT Slope)": "ascat/local_slope.final/ASCAT_slope_so.nc",
    "Photochemical Reflectance Index (MODIS PRI)": "modis/final/PRI_ESTIMATE_SODANKYLA_SINUSOIDAL.nc",
    "Land Surface Temperature (MODIS LST)": "modis/final/LST_ESTIMATE_SODANKYLA_SINUSOIDAL.nc",
    "Solar Induced Chlorophyll Fluorescence (OCO-2 SIF)": "sif/oco2/Sodankyla_SIF_OCO2_final.nc4",
    "Vegetation Optical Depth (AMSR-2 VOD)": "amsr2/final/AMSR2_so.nc"
} 

model_forcing = {
    "static-site-level": "FI-Sod_staticforcing.nc",
    "time-dependent (ERA5) - site level": "FI-Sod_dynforcing-era5_20090101-20211231_with-lwdown.nc",
    "time-dependent (in-situ) - site level": "FI-Sod_dynforcing-insitu_20090101-20211231_with-insitu-lwdown.nc",
    "static-regional": "sodankyla-region_cgls-pft-crops-redistributed_staticforcing.nc",
    "time-dependent (ERA5) - regional": "sodankyla-region_dynforcing_era5_2009-2021.nc"
}
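
As a side note, the geometry above is built by round-tripping shapely's __geo_interface__ through JSON to obtain a plain GeoJSON dict; an equivalent construction using the box and mapping helpers imported at the top would be:

# equivalent GeoJSON mapping for the same bounding box
geometry_alt = mapping(box(*bbox))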





collection = Collection.from_dict(
    
{
  "type": "Collection",
  "id": collectionid,
  "stac_version": "1.1.0",
  "title": "Terrestrial Carbon Community Assimilation System: Database for Lapland and Sodankyla region",
  "description": "The Terrestrial Carbon Community Assimilation System (TCCAS) is built around the coupled D&B terrestrial biosphere model. D&B has been newly developed based on the well-established DALEC and BETHY models and builds on the strengths of each component model. In particular, D&B combines the dynamic simulation of the carbon pools and canopy phenology of DALEC with the dynamic simulation of water pools, and the canopy model of photosynthesis and energy balance of BETHY. D&B includes a set of observation operators for optical as well as active and passive microwave observations. The focus of TCCAS is the combination of this diverse array of observational data streams with the D&B model into a consistent picture of the terrestrial carbon, water, and energy cycles. TCCAS applies a variational assimilation approach that adjusts a combination of initial pool sizes and process parameters to match the observational data streams. This dataset includes Satelite, Field and model forcing data sets for Sodankylä and Lapland region.",
  "extent": {
    "spatial": {
      "bbox": [
        [
          18.00,
          65.00,
          32.00,
          69.00
        ]
      ]
    },
    "temporal": {
      "interval": [
        [
          "2011-01-01T00:00:00Z",
          "2021-12-31T00:00:00Z"
        ]
      ]
    }
  },
  "license": "various",
  "links": []

}

)

collection # visualise the metadata of your collection 
create_items_from_data(
    collectionid,
    geometry,
    bbox,
    collection,
    root_url,
    data_files,
    region,
)
/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:8: FutureWarning: In a future version of xarray decode_timedelta will default to False rather than None. To silence this warning, set decode_timedelta to True, False, or a 'CFTimedeltaCoder' instance.
  ds = xr.open_dataset(root_url + dataset_filepath + '#mode=bytes')
/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
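
The UserWarnings above come from subtracting a timezone-suffixed np.datetime64 epoch inside create_items_from_data. They are harmless here, but if you prefer to avoid them, a sketch of an alternative for those two conversions inside the function (using the pandas import from the top) would be:

# warning-free alternative to the epoch arithmetic inside create_items_from_data
start_time = pd.Timestamp(ds['time'][0].values).to_pydatetime()
end_time = pd.Timestamp(ds['time'][-1].values).to_pydatetime()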
# add a single item with all the in-situ data, since it comes in a single .tgz file

add_insitu_data(
    collectionid,
    geometry,
    bbox,
    data_time,
    collection,
    'sodankyla-insitu-package.tgz',
)
2021-12-31 00:00:00+00:00
create_model_forcing_item(
    collectionid,
    geometry,
    bbox,
    data_time,
    collection,
    model_forcing)
add_documentation_item(
    collectionid,
    geometry,
    bbox,
    data_time,
    collection)
# save the full self-contained collection
collection.normalize_and_save(
    root_href=f'../../prr_preview/{collectionid}',
    catalog_type=pystac.CatalogType.SELF_CONTAINED
)
collection
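
After saving, you can re-open the collection from disk and validate it and all the items it contains before uploading to the PRR (a quick sanity check; the path below assumes the root_href used above):

# re-open the saved, self-contained collection and validate everything in it
saved = pystac.Collection.from_file(f'../../prr_preview/{collectionid}/collection.json')
saved.validate_all()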
# define dataset names and the base URL. If the data is stored locally, adjust these paths accordingly.

# create the parent collection
root_url = 'https://lcc.inversion-lab.com/data/eo/'
region = 'netherlands'
collectionid = "tccas-netherlands"
bbox = [4.502, 47.502, 11.477, 51.999]
geometry = json.loads(json.dumps(shapely.box(*bbox).__geo_interface__))
data_time = pd.to_datetime('2021-12-31T00:00:00Z')

data_files = {
    "Vegetation Optical Depth (SMOS)": "smos/smosL2/smosL2_1D_v700_reusel_trans.nc",
    "Slope": "ascat/local_slope.final/ASCAT_slope_re.nc",
    "Solar Induced Chlorophyll Fluorescence": "sif/tropomi/Reusel_SIF_TROPOMI_final.nc4",
    "Soil moisture": "smos/smosL2/smosL2_1D_v700_reusel_trans.nc",
    "Brightness temperature": "smos/smos_l3tb/SMOS_L3TB__reusel.nc",
    "Vegetation Optical Depth": "amsr2/final/AMSR2_re.nc",
    "Solar Induced Chlorophyll Fluorescence (OCO2)": "sif/oco2/Reusel_SIF_OCO2_final.nc4",
    "Land Surface Temperature": "modis/final/LST_ESTIMATE_REUSEL_SINUSOIDAL.nc",
    "Photochemical Reflectance Index": "modis/final/PRI_ESTIMATE_REUSEL_SINUSOIDAL.nc"
}


collection = Collection.from_dict(
    
{
  "type": "Collection",
  "id": collectionid,
  "stac_version": "1.1.0",
  "title": "Terrestrial Carbon Community Assimilation System: Database for the Netherlands region (including Reusel)",
  "description": "The Terrestrial Carbon Community Assimilation System (TCCAS) is built around the coupled D&B terrestrial biosphere model. D&B has been newly developed based on the well-established DALEC and BETHY models and builds on the strengths of each component model. In particular, D&B combines the dynamic simulation of the carbon pools and canopy phenology of DALEC with the dynamic simulation of water pools, and the canopy model of photosynthesis and energy balance of BETHY. D&B includes a set of observation operators for optical as well as active and passive microwave observations. The focus of TCCAS is the combination of this diverse array of observational data streams with the D&B model into a consistent picture of the terrestrial carbon, water, and energy cycles. TCCAS applies a variational assimilation approach that adjusts a combination of initial pool sizes and process parameters to match the observational data streams. This dataset includes Satelite, Field and model forcing data sets for the Netherlands region, including Reusel.",
  "extent": {
    "spatial": {
      "bbox": [
         [4.502, 47.502, 11.477, 51.999]
      ]
    },
    "temporal": {
      "interval": [
        [
          "2011-01-01T00:00:00Z",
          "2021-12-31T00:00:00Z"
        ]
      ]
    }
  },
  "license": "various",
  "links": []

}

)

collection # visualise the metadata of your collection 
create_items_from_data(
    collectionid,
    geometry,
    bbox,
    collection,
    root_url,
    data_files,
    region,
)
/tmp/ipykernel_119170/4235652563.py:8: FutureWarning: In a future version of xarray decode_timedelta will default to False rather than None. To silence this warning, set decode_timedelta to True, False, or a 'CFTimedeltaCoder' instance.
  ds = xr.open_dataset(root_url + dataset_filepath + '#mode=bytes')
/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
# add a single item with all the in-situ data, since it comes in a single .tgz file

add_insitu_data(
    collectionid,
    geometry,
    bbox,
    data_time,
    collection,
    'reusel-insitu-package.tgz',
)
2021-12-31 00:00:00+00:00
add_documentation_item(
    collectionid,
    geometry,
    bbox,
    data_time,
    collection)
# save the full self-contained collection
collection.normalize_and_save(
    root_href=f'../../prr_preview/{collectionid}',
    catalog_type=pystac.CatalogType.SELF_CONTAINED
)
collection
# define dataset names and the base URL. If the data is stored locally, adjust these paths accordingly.


# create the parent collection
root_url = 'https://lcc.inversion-lab.com/data/eo/'
region = 'iberia'
collectionid = "tccas-iberia"
bbox = [-8.496, 38.501, -2.505, 42.997]
geometry = json.loads(json.dumps(shapely.box(*bbox).__geo_interface__))
data_time = pd.to_datetime('2021-12-31T00:00:00Z')

data_files = {
    "Vegetation Optical Depth (SMOS)": "smos/smosL2/smosL2_1D_v700_lasmajadas_trans.nc",
    "Slope": "ascat/local_slope.final/ASCAT_slope_lm.nc",
    "Solar Induced Chlorophyll Fluorescence": "sif/tropomi/Majadas_SIF_TROPOMI_final.nc4",
    "Fraction of absorbed Photosynthetic Active Radiation, Leaf Area Index": "jrc-tip/jrctip_fapar-lai_majadas_20110101-20220105.nc",
    "Soil moisture": "smos/smosL2/smosL2_1D_v700_lasmajadas_trans.nc",
    "Brightness temperature": "smos/smos_l3tb/SMOS_L3TB__lasmajadas.nc",
    "Vegetation Optical Depth": "amsr2/final/AMSR2_lm.nc",
    "Land Surface Temperature": "modis/final/LST_ESTIMATE_MAJADAS_SINUSOIDAL.nc",
    "Photochemical Reflectance Index": "modis/final/PRI_ESTIMATE_MAJADAS_SINUSOIDAL.nc",
    "Solar Induced Chlorophyll Fluorescence (OCO2)": "sif/oco2/Majadas_SIF_OCO2_final.nc4"
}

model_forcing = {
    "static": "ES-LM1_staticforcing.nc",
    "time-dependent (ERA5)": "ES-LM1_dynforcing-era5_20090101-20211231_with-lwdown.nc",
    "time-dependent (in-situ)": "ES-LM1_dynforcing-insitu_20140401-20220930_with-insitu-lwdown.nc",
    "static-regional": "majadas-region_cgls-pft-moli2bare_staticforcing.nc",
    "time-dependent (ERA5)-regional": "majadas-region_dynforcing_era5_2009-2021.nc"
}



collection = Collection.from_dict(
    
{
  "type": "Collection",
  "id": collectionid,
  "stac_version": "1.1.0",
  "title": "Terrestrial Carbon Community Assimilation System: Database for Iberian region (including Majadas del Tietar)",
  "description": "The Terrestrial Carbon Community Assimilation System (TCCAS) is built around the coupled D&B terrestrial biosphere model. D&B has been newly developed based on the well-established DALEC and BETHY models and builds on the strengths of each component model. In particular, D&B combines the dynamic simulation of the carbon pools and canopy phenology of DALEC with the dynamic simulation of water pools, and the canopy model of photosynthesis and energy balance of BETHY. D&B includes a set of observation operators for optical as well as active and passive microwave observations. The focus of TCCAS is the combination of this diverse array of observational data streams with the D&B model into a consistent picture of the terrestrial carbon, water, and energy cycles. TCCAS applies a variational assimilation approach that adjusts a combination of initial pool sizes and process parameters to match the observational data streams. This dataset includes Satelite, Field and model forcing data sets for the Iberian region, including Majadas del Tietar.",
  "extent": {
    "spatial": {
      "bbox": [
        [-8.496, 38.501, -2.505, 42.997]
      ]
    },
    "temporal": {
      "interval": [
        [
          "2011-01-01T00:00:00Z",
          "2021-12-31T00:00:00Z"
        ]
      ]
    }
  },
  "license": "various",
  "links": []

}

)

collection # visualise the metadata of your collection 
create_items_from_data(
    collectionid,
    geometry,
    bbox,
    collection,
    root_url,
    data_files,
    region,
)
/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:8: FutureWarning: In a future version of xarray decode_timedelta will default to False rather than None. To silence this warning, set decode_timedelta to True, False, or a 'CFTimedeltaCoder' instance.
  ds = xr.open_dataset(root_url + dataset_filepath + '#mode=bytes')
/tmp/ipykernel_119170/4235652563.py:12: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (start_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
/tmp/ipykernel_119170/4235652563.py:16: UserWarning: no explicit representation of timezones available for np.datetime64
  ts = (end_time - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
# add a single item with all the in-situ data, since it comes in a single .tgz file

add_insitu_data(
    collectionid,
    geometry,
    bbox,
    data_time,
    collection,
    'lasmajadas-insitu-package.tgz',
)
2021-12-31 00:00:00+00:00
create_model_forcing_item(
    collectionid,
    geometry,
    bbox,
    data_time,
    collection,
    model_forcing)
add_documentation_item(
    collectionid,
    geometry,
    bbox,
    data_time,
    collection)
# save the full self-contained collection
collection.normalize_and_save(
    root_href=f'../../prr_preview/{collectionid}',
    catalog_type=pystac.CatalogType.SELF_CONTAINED
)
collection

Acknowledgments

We gratefully acknowledge the Terrestrial Carbon Community Assimilation System (TCCAS) team for providing access to the data used in this example, as well as for their support in creating it.