
WAPOSAL - Wave power density, zero-crossing wave period, and significant wave height product

This notebook shows how to

  1. Combine a number of NetCDF (.nc) files into a Zarr store

  2. Generate a valid STAC collection, which is required for uploading research outcomes to the ESA Project Results Repository (PRR).

The code below demonstrates how to perform the necessary steps using real data from the ESA project WAPOSAL. The focus of WAPOSAL is the analysis of wave power density, zero-crossing wave period, and significant wave height products obtained along-track from Sentinel-3 A/B and CryoSat-2 altimeter data processed by the SAMOSA+ retracker.

Check the EarthCODE documentation and the PRR STAC introduction example for a more general introduction to STAC and the ESA PRR.

🔗 Check the project website: WAPOSAL – Website

🔗 Check the eo4society page: WAPOSAL – eo4society

1. Combine the multiple trajectory files into Zarr stores

from pathlib import Path
import numpy as np
import xarray as xr
import re
import os
from joblib import Parallel, delayed
import pickle
# define region information
bbox_ = {}
bbox_['UK'] = [-11.0,47.5,10.0,60.0]
bbox_['BN'] = [3.0,53.0,30.35,72.0]
bbox_['FF'] = [-10.0,43.0,-0.5,49.0]
bbox_['NS'] = [-10.0,42.0,-1.0,45.0]
bbox_['FP'] = [-155.0,-27.0,-134.0,-7.0]
bbox_['MT'] = [-6.0,30.1,41.8,47.4]
bbox_['FG'] = [-54.0,4.1,-51.5,6.0]
bbox_['CN'] = [-19.0,26.0,-10.0,31.0]
bbox_['PT'] = [-10.0,37.0,-8.0,42.0]
bbox_['MD'] = [-18.5,27.0,-13.0,30.0]
bbox_['AZ'] = [-32.0,36.5,-24.0,40.0]

satellite_names = {'S3A': 'Sentinel-3', 'S3B': 'Sentinel-3', 'CS2': 'CryoSat-2'}
platform_id = {'S3A': 'A', 'S3B': 'B', 'CS2': ''}
sensor = {'S3A': 'SRAL', 'S3B': 'SRAL', 'CS2': 'SIRAL'}
zone = {'AZ': 'Azores', 
        'FG': 'French Guiana', 
        'FF': 'French Facade', 
        'UK': 'UK', 
        'FP': 'French Polynesia',
        'CN': 'Canaries',
        'PT': 'Portugal',
        'NS': 'North Spain',
        'BN': 'Baltic and Norway',
        'MT': 'Mediterranean',
        'MD': 'Madeira'}

epoch_2000 = np.datetime64('2000-01-01T00:00:00')
def return_temporal_size(f):
    return xr.open_dataset(f, decode_timedelta=False).time.shape[0]


def combine_to_trajectory_zarr(region, satellite, files):

    # read precomputed coordinate shapes (see the `dimensions` scan below);
    # obs_len is the maximum observation count for this region/satellite
    trajectories_len, obs_len = dimensions[f'{region}-{satellite}']
    join_strategy = 'exact' if obs_len is not None else 'outer'
    
    datasets_list = []
    
    for filepath in files:
    
        # open file and drop dim
        ds = xr.open_dataset(filepath, decode_times=False)
        ds = ds.sel(dim=0)
        
        # rename the 'time' dimension to 'obs'
        ds = ds.rename_dims({'time': 'obs'})
        
        # pad to maximum observations
        ds = ds.pad(obs=(0, obs_len - ds.obs.size), mode='constant', constant_values=np.nan)
    
        # expand trajectory dimension
        ds = ds.expand_dims('trajectory')
        
        # Create the 'trajectory_info' variable
        filename = os.path.basename(filepath)
        ds['trajectory_info'] = xr.DataArray(
            [filename],  # Data is the filename
            dims=['trajectory'],  # Depends on the new dimension
            attrs={'long_name': 'trajectory info'} # Add attributes
        )
    
        # add to concat list
        datasets_list.append(ds)

    # combine the datasets
    combined_ds = xr.concat(
        datasets_list, 
        dim='trajectory', 
        join=join_strategy, 
        fill_value=np.nan
    )

    del datasets_list
    
    # update metadata
    combined_ds = combined_ds.chunk(trajectory=1000)
    start_time = epoch_2000 + np.timedelta64(int(combined_ds.TAI_Time_20Hz.isel(trajectory=0).min(skipna=True).values), 's')
    end_time = epoch_2000 + np.timedelta64(int(combined_ds.TAI_Time_20Hz.isel(trajectory=-1).max(skipna=True).values), 's')
    
    combined_ds.attrs.update({
        "Conventions" : "CF-1.6/CF-1.7",
        "feature_type" :"trajectory",
        "start_datetime": str(start_time),
        "end_datetime": str(end_time),
        "created": '2025-11-14T00:00:00Z',        
        "description": f"Wave power density, zero-crossing wave period, and significant wave height product obtained along-track from {satellite_names[satellite]} altimeter data processed by SAMOSA+ retracker over the {zone[region]} region.",
        "zone": zone[region], # region for example Azores        
        "platform_name": satellite,
        "institution": 'CENTEC-IST-ID',
        "license": "CC-BY-SA-4.0",
        "min_lon": bbox_[region][0],
        "min_lat": bbox_[region][1],
        "max_lon": bbox_[region][2],
        "max_lat": bbox_[region][3],
        "crs": 'epsg:4326',
        "referencfe": "https://opensciencedata.esa.int/products/waposal-waves/collection"
    })

    return combined_ds
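
The TAI_Time_20Hz variable stores elapsed seconds since the 2000-01-01 TAI epoch, which is how the start/end attributes above are derived. As a minimal sketch (assuming ds is a dataset returned by combine_to_trajectory_zarr), the raw seconds can be converted to datetime64 values like this:

# minimal sketch: convert raw TAI seconds (since 2000-01-01) to datetime64
# values; assumes `ds` was returned by combine_to_trajectory_zarr above
seconds = ds.TAI_Time_20Hz.isel(trajectory=0).dropna(dim='obs')
times = epoch_2000 + seconds.astype('timedelta64[s]')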
rootpath = "/mnt/d/data/waposal/extracted/"
regions = ['AZ', 'BN', 'CN', 'FF', 'FG', 'FP', 'MD', 'MT', 'NS', 'PT', 'UK']
satellites = ['S3A', 'S3B', 'CS2']
# define the dimensions of each zarr store.
# We need to know these ahead of time to align all the observations.
import gc

dimensions = {}

for region in regions:
    for satellite in satellites:
        print(region + '-' + satellite)
        files = sorted(Path(f'{rootpath}/{region}/{region}/{satellite}/').glob("./*/**/*.nc"))
        if len(files) == 0: continue
        sizes = Parallel(n_jobs=-1)(delayed(return_temporal_size)(f) for f in files)
        obs_len = max(sizes)
        trajectories_len = len(files)
        dimensions[f'{region}-{satellite}'] = (trajectories_len, obs_len)
        del sizes
        gc.collect()

# checkpoint the computed dimensions so the slow scan over all files
# does not have to be repeated on later runs
with open('dimensions.pkl', 'wb') as f:
    pickle.dump(dimensions, f)
with open('dimensions.pkl', 'rb') as f:
    dimensions = pickle.load(f)
dimensions
{'AZ-S3A': (1711, 1218), 'AZ-S3B': (1019, 1227), 'AZ-CS2': (1554, 573), 'BN-S3A': (8719, 6882), 'BN-S3B': (5302, 6927), 'BN-CS2': (8632, 5309), 'CN-S3A': (1998, 1735), 'CN-S3B': (1259, 1750), 'FF-S3A': (2274, 2095), 'FF-S3B': (1608, 2112), 'FF-CS2': (4533, 2245), 'FG-S3A': (527, 658), 'FG-S3B': (327, 658), 'FP-S3A': (4864, 6885), 'FP-S3B': (3046, 6956), 'FP-CS2': (1668, 7019), 'MD-S3A': (1248, 1041), 'MD-S3B': (778, 1050), 'MD-CS2': (4406, 2929), 'MT-S3A': (10339, 6263), 'MT-S3B': (6344, 6072), 'MT-CS2': (21047, 5495), 'NS-S3A': (1867, 1046), 'NS-S3B': (1210, 1061), 'NS-CS2': (3453, 1123), 'PT-S3A': (625, 1740), 'PT-S3B': (445, 1740), 'PT-CS2': (4208, 3932), 'UK-S3A': (5322, 4488), 'UK-S3B': (3273, 4434), 'UK-CS2': (9520, 4671)}
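These precomputed sizes let combine_to_trajectory_zarr pad every trajectory to the regional maximum before concatenating, so the result is a rectangular, NaN-padded array. Below is a minimal toy sketch of that pad-then-concat pattern; the short and long datasets are illustrative only:

# toy sketch of the pad-then-concat alignment used in combine_to_trajectory_zarr
short = xr.Dataset({'v': ('obs', np.array([1.0, 2.0]))})
long = xr.Dataset({'v': ('obs', np.array([3.0, 4.0, 5.0]))})
max_obs = max(short.sizes['obs'], long.sizes['obs'])
aligned = [
    d.pad(obs=(0, max_obs - d.sizes['obs']), constant_values=np.nan)
     .expand_dims('trajectory')
    for d in (short, long)
]
combined = xr.concat(aligned, dim='trajectory')  # dims: (trajectory: 2, obs: 3)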
for region in regions:
    for satellite in satellites:
        print(region + '-' + satellite)
        files = sorted(Path(f'{rootpath}/{region}/{region}/{satellite}/').glob("./*/**/*.nc"))
        if len(files) == 0:
            continue
        ds = combine_to_trajectory_zarr(region, satellite, files)
        ds.to_zarr(f'{rootpath}combined/{region}-{satellite}.zarr', mode='w', write_empty_chunks=False, zarr_format=2)
        del ds
        gc.collect()
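
Setting write_empty_chunks=False means chunks that consist entirely of the store's fill value (here, the NaN padding) are not written to disk, which keeps the padded stores compact; zarr_format=2 keeps the stores readable by the widest range of existing Zarr tooling.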
## assert values are not changed between the originals and the zarr store
# `files`, `region`, and `satellite` refer to the last combination processed
# above; `ds` was deleted in the loop, so reopen the corresponding store
ds = xr.open_zarr(f'{rootpath}combined/{region}-{satellite}.zarr', decode_times=False)
for i, f in enumerate(files):

    true_values = xr.open_dataset(f, decode_times=False).Sigma0_20Hz.values.flatten()
    
    # check that all values are the same
    assert np.isclose(
        true_values, 
        ds.sel(trajectory=i).Sigma0_20Hz.values.flatten()[:true_values.shape[0]], 
        equal_nan=True
    ).all()
    
    # check that all padding is NA
    assert np.isnan(ds.sel(trajectory=i).Sigma0_20Hz.values.flatten()[true_values.shape[0]:]).all()

2. Generate STAC collection for the new files

from pystac import Collection
import pystac
import xarray as xr
import shapely
import json
from datetime import datetime
from xstac import xarray_to_stac
from xstac._xstac import build_horizontal_dimension

# define collection id, since it will be reused
collectionid = "waposal"

# create the root collection using pystac.Collection

collection = Collection.from_dict(
    
{
  "type": "Collection",
  "id": "waposal",
  "stac_version": "1.1.0",
  "description": "Wave power density, zero-crossing wave period, and significant wave height product obtained along track based on Sentinel-3 A/B and Cryosat-2 altimeter data processed by SAMOSA+ retracker. The products cover the Atlantic coast of Europe, Madeira, the French Polynesian archipelagos, the Azores, the Canary Islands, the Mediterranean and Baltic Seas, and the coastal zone of French Guiana.",
  "links": [],
  "stac_extensions": [
    "https://stac-extensions.github.io/osc/v1.0.0/schema.json",
    "https://stac-extensions.github.io/themes/v1.0.0/schema.json",
    "https://stac-extensions.github.io/cf/v0.2.0/schema.json"
  ],
  "osc:project": "waposal",
  "osc:status": "completed",
  "osc:region": "severeal",
  "osc:type": "product",
  "created": "2025-10-29T11:40:41Z",
  "updated": "2025-10-29T11:40:41Z",
  "sci:doi": "https://doi.org/10.57780/s3d-83ad619",
  "cf:parameter": [
    {
      "name": "wave-period"
    },
    {
      "name": "wave-height"
    }
  ],
  "themes": [
    {
      "scheme": "https://github.com/stac-extensions/osc#theme",
      "concepts": [
        {
          "id": "oceans"
        }
      ]
    }
  ],
  "osc:missions": [
    "sentinel-3",
    "cryosat-2"
  ],
  "osc:variables": [
    "wave-period",
    "wave-height"
  ],
  "title": "High-resolution WAPOSAL wave period and wave power density from Sentinel-3 A/B, CryoSat-2, and SAMOSA+ retracker",
  "extent": {
    "spatial": {
      "bbox": [
        [
          -155,
          -27,
          41.8,
          72
        ],
        [
          -32,
          36.499999999999986,
          -24,
          40
        ],
        [
          -10,
          42.00000000000003,
          -1,
          45
        ],
        [
          -10,
          43,
          -0.5,
          49
        ],
        [
          3,
          53,
          30.35,
          72
        ],
        [
          -11,
          47.5,
          10,
          60.00000000000003
        ],
        [
          -155,
          -27,
          -134,
          -6.999999999999986
        ],
        [
          -54,
          4.1000000000000085,
          -51.5,
          6
        ],
        [
          -19,
          26,
          -10,
          31
        ],
        [
          -6,
          30.09999999999998,
          41.79999999999999,
          47.400000000000006
        ],
        [
          -19,
          27,
          -5,
          39
        ],
        [
          -13.000000000000002,
          34,
          -5.5,
          44.5
        ]
      ]
    },
    "temporal": {
      "interval": [
        [
          "2011-01-01T00:00:00Z",
          "2023-12-31T23:59:59Z"
        ]
      ]
    }
  },
  "license": "CC-BY-SA-4.0",
  "keywords": [
    "altimeter data",
    "wave energy",
    "wave power",
    "wave frequency",
    "wave"
  ]
}

)

collection
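Before adding items, the root collection can be checked against the STAC core schema and the declared extension schemas (a minimal sanity check; it fetches the referenced JSON schemas, so network access is required):

# validate the root collection; raises STACValidationError on failure
collection.validate()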
files = sorted(Path('./combined').glob("*.zarr"))

for f in files:
    ds = xr.open_zarr(f, decode_times=False)

    # Describe each file following the datacube STAC extension standard.
    # Everything is extracted from metadata/data already present in the file;
    # we only specify the template and which information to extract.

    # build the item bbox and geometry from the global attributes set earlier
    bbox = [ds.attrs['min_lon'], ds.attrs['min_lat'], ds.attrs['max_lon'], ds.attrs['max_lat']]
    geometry = json.loads(json.dumps(shapely.box(*bbox).__geo_interface__))
    
    template = {
    
        "id": f"{collectionid}-{f.stem.lower()}",
        "type": "Feature",
        "stac_version": "1.1.0",
        "description": ds.attrs['description'],
        "title": f.stem,
        "properties": {
                "feature_type" : "trajectory",
                "start_datetime": ds.attrs['start_datetime'] + "Z", 
                "end_datetime": ds.attrs['end_datetime'] + "Z",
                "zone" : ds.attrs['zone'],
                "platform_name" :  ds.attrs['platform_name'],
                "institution" :  ds.attrs['institution'],
                "license" :  ds.attrs['license'],
                "reference": ds.attrs['referencfe']
        },
        "geometry": geometry,
        "bbox": bbox,
        "assets": {
            "data": {
                "href": f"https://s3.waw4-1.cloudferro.com/EarthCODE/OSCAssets/waposal/{f.stem}.zarr",  # or local path
                "type": "application/vnd+zarr",
                "roles": ["data"],
                "title": f.stem
            }
        }
    }
    
    # generate the STAC Item from the dataset and the template; the
    # spatial/temporal dimension detection is disabled because this is
    # trajectory data, not a regular grid
    item = xarray_to_stac(
        ds,
        template,
        temporal_dimension=False,
        x_dimension=False,
        y_dimension=False,
        reference_system=False
    )

    # describe the remaining dimensions in the datacube metadata;
    # build_horizontal_dimension (a private xstac helper) extracts the
    # extent and step of each dimension, and we set the datacube type
    dims = item.properties['cube:dimensions']
    for d in list(ds.dims):
        r = build_horizontal_dimension(ds, d, None, None, None, None, False).to_dict()
        r['type'] = "trajectory" if d == 'trajectory' else "observation"
        # drop empty keys to keep the item compact
        for k, v in r.copy().items():
            if v is None:
                del r[k]
        dims[d] = r

    item.validate()

    collection.add_item(item)

collection.normalize_and_save(
    root_href=f'./{collectionid}',
    catalog_type=pystac.CatalogType.SELF_CONTAINED
)
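
As a final round-trip check, the saved self-contained collection can be loaded back with pystac and its items listed; the path follows from the normalize_and_save call above, which writes collection.json at the catalog root:

# reload the saved collection and list its items as a round-trip check
saved = pystac.Collection.from_file(f'./{collectionid}/collection.json')
for item in saved.get_items():
    print(item.id, item.bbox)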

Acknowledgments

We gratefully acknowledge the WAPOSAL team for providing access to the data used in this example and for their support in creating it.