Skip to article frontmatterSkip to article content

Generating OSC information using pystac

This notebook shows how to generate OSC Projects, Products and Workflows using pystac. EarthCODE provides a GUI editor that offers this functionality and more through a user interface. However, if you decide to create items manually, a library like pystac can save some time. The code described here does not carry out all the steps required to pass the automated OSC validation. For example, you still have to generate all return links as described in the manual PR tutorial. You’ll also have to open the PR manually at the end.

NOTE: Before you run the notebook, you’ll need a fork of the open-science-catalog-metadata repository. See the Manual PR Tutorial for how to create one.

Import libraries

from datetime import datetime, timezone

import pystac
from pystac.extensions.projection import ProjectionExtension

Get all entries from the Open Science Catalog

# Open the root catalog of the locally cloned open-science-catalog-metadata repository.
catalog = pystac.Catalog.from_file('../../open-science-catalog-metadata/catalog.json')

# Sub-catalogs holding the controlled vocabularies: themes, ESA EO missions and variables.
themes = catalog.get_child('themes')
missions = catalog.get_child('eo-missions')
variables = catalog.get_child('variables')

# Ids of every entry currently available in each vocabulary.
allowed_themes = [entry.id for entry in themes.get_children()]
allowed_missions = [entry.id for entry in missions.get_children()]
allowed_variables = [entry.id for entry in variables.get_children()]

# Sub-catalogs of the existing projects, products and workflows.
projects = catalog.get_child('projects')
products = catalog.get_child('products')
workflows = catalog.get_child('workflows')

Define helper functions | Add new variables, theme and eo missions

def add_product_variables(collection, variables_to_add):
    '''Add variables to the collection custom fields and add links to the variables catalog.

    Args:
        collection: Object exposing ``add_link`` and ``extra_fields``
            (a ``pystac.Collection`` in this notebook). Modified in place.
        variables_to_add: Ids of the variables to attach; each must be listed
            in the module-level ``allowed_variables``.

    Raises:
        ValueError: If any id is not in ``allowed_variables``. Raised before
            the collection is modified.
    '''
    # Validate everything up front so an unknown id cannot leave the collection
    # half-updated. An explicit raise also survives ``python -O``, unlike assert.
    unknown = [variable for variable in variables_to_add if variable not in allowed_variables]
    if unknown:
        raise ValueError(f"Unknown OSC variable id(s): {unknown}")

    for variable in variables_to_add:
        # Resolve the variable's catalog once and reuse it for both href and title.
        variable_catalog = variables.get_child(variable)
        collection.add_link(
            pystac.Link(rel="related",
                        target=variable_catalog.get_links('self')[0].href,
                        media_type="application/json",
                        title=f"Variable: {variable_catalog.title}")
        )

    # Record the variable ids in the OSC custom field.
    collection.extra_fields.update({
        "osc:variables": variables_to_add
    })

def add_themes(collection, themes_to_add):
    '''Add themes to the collection custom fields and add links to the themes catalog.

    All theme ids are written as concepts of a single entry using the OSC
    theme scheme, the same layout the workflow record in this notebook uses.

    Args:
        collection: Object exposing ``add_link`` and ``extra_fields``
            (a ``pystac.Collection`` in this notebook). Modified in place.
        themes_to_add: Ids of the themes to attach; each must be listed in the
            module-level ``allowed_themes``.

    Raises:
        ValueError: If any id is not in ``allowed_themes``. Raised before the
            collection is modified.
    '''
    # Validate everything up front so an unknown id cannot leave the collection
    # half-updated. An explicit raise also survives ``python -O``, unlike assert.
    unknown = [theme for theme in themes_to_add if theme not in allowed_themes]
    if unknown:
        raise ValueError(f"Unknown OSC theme id(s): {unknown}")

    for theme in themes_to_add:
        # Resolve the theme's catalog once and reuse it for both href and title.
        theme_catalog = themes.get_child(theme)
        collection.add_link(
            pystac.Link(rel="related",
                        target=theme_catalog.get_links('self')[0].href,
                        media_type="application/json",
                        title=f"Theme: {theme_catalog.title}")
        )

    # One scheme entry carrying every concept; no entry at all when there are
    # no themes, so an empty "concepts" list is never written.
    themes_list = []
    if themes_to_add:
        themes_list.append(
            {
                "scheme": "https://github.com/stac-extensions/osc#theme",
                "concepts": [{"id": theme} for theme in themes_to_add]
            }
        )

    # Add themes to the custom fields
    collection.extra_fields.update({
        "themes": themes_list
    })


def add_links(collection, relations, targets, titles):
    '''Add links from the collection to outside websites.

    The three arguments are read in lockstep with ``zip``: the i-th relation,
    target and title form one link. If they differ in length, the extra
    entries of the longer ones are ignored.

    Args:
        collection: Object exposing ``add_links`` (a ``pystac.Collection`` in
            this notebook). Modified in place.
        relations: Link relation types (``rel``), one per link.
        targets: Link targets, one per link.
        titles: Link titles, one per link.
    '''
    collection.add_links([
        pystac.Link(rel=rel, target=target, title=title)
        for rel, target, title in zip(relations, targets, titles)
    ])


def create_contract(name, roles, emails):
    '''Build a contact entry with a name, a list of roles and optional emails.

    The "emails" key is only present when ``emails`` is non-empty; each
    address is wrapped as ``{"value": address}``.
    '''
    entry = dict(name=name, roles=list(roles))
    if not emails:
        return entry
    entry['emails'] = [dict(value=address) for address in emails]
    return entry

def add_product_missions(collection, missions_to_add):
    '''Add missions to the collection custom fields and add links to the missions catalog.

    Args:
        collection: Object exposing ``add_link`` and ``extra_fields``
            (a ``pystac.Collection`` in this notebook). Modified in place.
        missions_to_add: Ids of the EO missions to attach; each must be listed
            in the module-level ``allowed_missions``.

    Raises:
        ValueError: If any id is not in ``allowed_missions``. Raised before
            the collection is modified.
    '''
    # Validate everything up front so an unknown id cannot leave the collection
    # half-updated. An explicit raise also survives ``python -O``, unlike assert.
    unknown = [mission for mission in missions_to_add if mission not in allowed_missions]
    if unknown:
        raise ValueError(f"Unknown EO mission id(s): {unknown}")

    for mission in missions_to_add:
        # Resolve the mission's catalog once and reuse it for both href and title.
        mission_catalog = missions.get_child(mission)
        collection.add_link(
            pystac.Link(rel="related",
                        target=mission_catalog.get_links('self')[0].href,
                        media_type="application/json",
                        title=f"EO Mission: {mission_catalog.title}"
            )
        )

    # Record the mission ids in the OSC custom field.
    collection.extra_fields.update({
         "osc:missions": missions_to_add
    })

Define helper functions | Create new project collection


def create_project_collection(project_id, project_title, project_description,
                      project_status, extent, project_license):

    '''Create project collection template from the provided information.

    Args:
        project_id: Collection id.
        project_title: Collection title.
        project_description: Collection description.
        project_status: Value stored under ``osc:status``.
        extent: Extent passed through to ``pystac.Collection``.
        project_license: License passed through to ``pystac.Collection``.

    Returns:
        A ``pystac.Collection`` with ``osc:type`` set to ``"project"``, an
        ``updated`` timestamp, the osc/themes/contacts extensions declared and
        root/parent links added.
    '''

    # The trailing "Z" means UTC, so the timestamp must be taken in UTC rather
    # than from the naive local clock.
    updated = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Create the collection
    collection = pystac.Collection(
        id=project_id,
        description=project_description,
        extent=extent,
        license=project_license,
        title=project_title,
        extra_fields = {
            "osc:status": project_status,
            "osc:type": "project",
            "updated": updated
        },
        stac_extensions=[
            "https://stac-extensions.github.io/osc/v1.0.0/schema.json",
            "https://stac-extensions.github.io/themes/v1.0.0/schema.json",
            "https://stac-extensions.github.io/contacts/v0.1.1/schema.json"
        ]

    )

    # Add pre-determined links. NOTE: no rel="self" link is added here.
    collection.add_links([
        pystac.Link(rel="root", target="../../catalog.json", media_type="application/json", title="Open Science Catalog"),
        pystac.Link(rel="parent", target="../catalog.json", media_type="application/json", title="Projects"),
    ])

    return collection

Define helper functions | Create new product collection

def create_product_collection(product_id, product_title, product_description, product_extent, product_license,
                              product_keywords, product_status, product_region, product_project_id, product_project_title,
                              product_parameters=None, product_doi=None):
    '''Create product collection template from the provided information.

    Args:
        product_id: Collection id.
        product_title: Collection title.
        product_description: Collection description.
        product_extent: Extent passed through to ``pystac.Collection``.
        product_license: License passed through to ``pystac.Collection``.
        product_keywords: Keywords passed through to ``pystac.Collection``.
        product_status: Value stored under ``osc:status``.
        product_region: Value stored under ``osc:region``.
        product_project_id: Id of the owning project; stored under
            ``osc:project`` and used to build the related project link.
        product_project_title: Project title used in the related link title.
        product_parameters: Optional names, each stored as ``{"name": ...}``
            under ``cf:parameter``. Skipped when empty or ``None``.
        product_doi: Optional value stored under ``sci:doi``. Skipped when ``None``.

    Returns:
        A ``pystac.Collection`` with ``osc:type`` set to ``"product"``, the
        osc/themes/cf extensions declared and root/parent/project links added.
    '''

    collection = pystac.Collection(
            id=product_id,
            title=product_title,
            description=product_description,
            extent=product_extent,
            license=product_license,
            keywords=product_keywords,
            stac_extensions=[
                "https://stac-extensions.github.io/osc/v1.0.0/schema.json",
                "https://stac-extensions.github.io/themes/v1.0.0/schema.json",
                "https://stac-extensions.github.io/cf/v0.2.0/schema.json"
            ],
        )

    # Add pre-determined links. NOTE: no rel="self" link is added here.
    collection.add_links([
        pystac.Link(rel="root", target="../../catalog.json", media_type="application/json", title="Open Science Catalog"),
        pystac.Link(rel="parent", target="../catalog.json", media_type="application/json", title="Products"),
        pystac.Link(rel="related", target=f"../../projects/{product_project_id}/collection.json", media_type="application/json", title=f"Project: {product_project_title}"),

    ])

    # The trailing "Z" means UTC, so take the time in UTC rather than from the
    # naive local clock. Taken once so "created" and "updated" always agree.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Add extra properties
    collection.extra_fields.update({
        "osc:project": product_project_id,
        "osc:status": product_status,
        "osc:region": product_region,
        "osc:type": "product",
        "created": timestamp,
        "updated": timestamp,
    })

    if product_doi is not None:
        collection.extra_fields["sci:doi"] = product_doi

    if product_parameters:
        collection.extra_fields["cf:parameter"] = [{"name": p} for p in product_parameters]

    return collection

Define helper functions | Create new workflow record

def create_workflow_collection(workflow_id, workflow_title,
                               workflow_description, workflow_license, workflow_extent,
                               workflow_keywords, workflow_formats, workflow_project, workflow_project_title):

    '''Create a workflow record template from the provided information.

    Args:
        workflow_id: Record id.
        workflow_title: Stored under ``properties.title``.
        workflow_description: Stored under ``properties.description``.
        workflow_license: Stored under ``properties.license``.
        workflow_extent: Accepted for signature compatibility; currently not
            written to the record.
        workflow_keywords: Stored under ``properties.keywords``.
        workflow_formats: Format names, each stored as ``{"name": ...}``.
        workflow_project: Id of the owning project; stored under
            ``osc:project`` and used to build the related project link.
        workflow_project_title: Project title used in the related link title.

    Returns:
        A plain ``dict`` shaped as an OGC API Records feature with root,
        parent and project links. ``osc:status`` is always ``"completed"`` and
        ``version`` is always ``"1"``.
    '''

    # The trailing "Z" means UTC, so take the time in UTC rather than from the
    # naive local clock. Taken once so "created" and "updated" always agree.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Create the record
    collection = {
        'id': workflow_id,
        'type': 'Feature',
        'geometry': None,
        "conformsTo": ["http://www.opengis.net/spec/ogcapi-records-1/1.0/req/record-core"],
        "properties": {
            "title": workflow_title,
            "description": workflow_description,
            "osc:type": "workflow",
            "osc:project": workflow_project,
            "osc:status": "completed",
            "formats": [{"name": f} for f in workflow_formats],
            "updated": timestamp,
            "created": timestamp,
            "keywords": workflow_keywords,
            "license": workflow_license,
            "version": "1"
        },
        "linkTemplates": [],
        "links": [
            {
                "rel": "root",
                "href": "../../catalog.json",
                "type": "application/json",
                "title": "Open Science Catalog"
            },
            {
                "rel": "parent",
                "href": "../catalog.json",
                "type": "application/json",
                "title": "Workflows"
            },
            {
                "rel": "related",
                "href": f"../../projects/{workflow_project}/collection.json",
                "type": "application/json",
                "title": f"Project: {workflow_project_title}"
            },
        ]

    }

    return collection

Create a metadata collection for new project

# Define id, title, description, project status, license
project_id = "worldcereal2"
project_title = "WorldCereal2"
project_description = "WorldCereal is an ESA initiative that provides global cropland and crop type maps at 10-meter resolution, offering seasonally updated data on temporary crops, croptypes (maize, winter cereals and spring cereals), and irrigation."
project_status = "completed"
project_license = 'proprietary'

# Define spatial and temporal extent
spatial_extent = pystac.SpatialExtent([[-180.0, -90.0, 180.0, 90.0]])
temporal_extent = pystac.TemporalExtent([[datetime(2021, 1, 1), datetime(2021, 12, 31, 23, 59, 59)]])
extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)

# Define links and link titles
project_link_targets = ["https://esa-worldcereal.org/en", 
                        "https://eo4society.esa.int/projects/worldcereal-global-crop-monitoring-at-field-scale/"]
project_link_relations = ["via", "via"]
project_link_titles = ["Website", "EO4Society Link"]

# Define project themes
project_themes = ["land"]

# contacts
project_contracts_info = [
    ("Zoltan Szantoi", ["technical_officer"], ["Zoltan.Szantoi@esa.int"]),
    ("VITO Remote Sensing", ["consortium_member"], None)
]
collection = create_project_collection(project_id, project_title, project_description, 
                      project_status, extent, project_license)
# add links
add_links(collection, project_link_relations, project_link_targets, project_link_titles)
## add themes
add_themes(collection, project_themes)

# Add contacts
collection.extra_fields.update({

    "contacts": [create_contract(*info) for info in project_contracts_info]
    
})
collection.validate()
collection
# save this file and copy it to the catalog/projects/{project}/collection.json
collection.save_object(dest_href='project_collection.json')
# optionally run this code to transfer the generated file to the OSC folder, ready to be commited.
!mkdir -p ../open-science-catalog-metadata-staging/projects/worldcereal2/
!cp project_collection.json ../open-science-catalog-metadata-staging/projects/worldcereal2/collection.json

Create a metadata collection for new product

product_id = "worldcereal-crop-extent-belgium2"
product_title = "WorldCereal Crop Extent - Belgium2"
product_description = "WorldCereal is an ESA initiative that provides global cropland and crop type maps at 10-meter resolution, offering seasonally updated data on temporary crops, croptypes (maize, winter cereals and spring cereals), and irrigation. This dataset provides the outputs for Belgium."
product_keywords = [
    "Crops",
    "Cereal"
]
product_status = "ongoing"
product_license = "proprietary"

# Define spatial and temporal extent
product_spatial_extent = pystac.SpatialExtent([[2.5135, 49.529, 6.156, 51.475]])
product_temporal_extent = pystac.TemporalExtent([[datetime(2021, 1, 1), datetime(2021, 12, 31, 23, 59, 59)]])
product_extent = pystac.Extent(spatial=product_spatial_extent, temporal=product_temporal_extent)
product_region = "Belgium"
product_themes = ["land"]
product_missions = [ "sentinel-2"]
product_variables = [  "crop-yield-forecast" ]
product_parameters = [  "crop-yield-forecast" ]

product_project_id = "worldcereal2"
product_project_title = "WorldCereal2"

product_doi = "https://doi.org/10.57780/s3d-83ad619"


# define links to add

product_target_relations = ['child', 'via', 'via']
product_target_links = ['https://eoresults.esa.int/stac/collections/sentinel3-ampli-ice-sheet-elevation',
                        'https://eoresults.esa.int/browser/#/external/eoresults.esa.int/stac/collections/sentinel3-ampli-ice-sheet-elevation',
                        'https://eoresults.esa.int/d/sentinel3-ampli-ice-sheet-elevation/2025/05/07/sentinel-3-ampli-user-handbook/S3_AMPLI_User_Handbook.pdf']
product_target_titles = ['PRR link', 'Access', 'Documentation']
product_collection = create_product_collection(
    product_id, product_title, product_description, product_extent, 
    product_license, product_keywords, product_status, product_region, 
    product_project_id, product_project_title, product_parameters, product_doi)
# add themes
add_themes(product_collection, product_themes)

add_product_missions(product_collection, product_missions)
add_product_variables(product_collection, product_variables)
# add links
add_links(product_collection,
          product_target_relations,
          product_target_links,
          product_target_titles
)
product_collection.validate()
product_collection
# save this file and copy it to the catalog/products/{product_id}/collection.json
product_collection.save_object(dest_href='product_collection.json')
# optionally run this code to transfer the generated file to the OSC folder, ready to be commited.
!mkdir -p ../open-science-catalog-metadata-staging/products/worldcereal-crop-extent-belgium2/
!cp product_collection.json ../open-science-catalog-metadata-staging/products/worldcereal-crop-extent-belgium2/collection.json

Create a metadata collection for new workflow

workflow_id = "worldcereal-workflow2"
workflow_title="ESA worldcereal global crop extent detector2"
workflow_description="Detects crop land at 10m resolution, trained for global use. Based on Sentinel-1 and 2 data..."
workflow_license = "proprietary"
workflow_keywords= ["agriculture", "crops"]
workflow_formats = ["GeoTIFF"]
workflow_project = "worldcereal2"
workflow_project_title = "WorldCereal2"

workflow_themes = ['land']

# Define spatial and temporal extent
spatial_extent = pystac.SpatialExtent([[-180.0, -90.0, 180.0, 90.0]])
temporal_extent = pystac.TemporalExtent([[datetime(2022, 2, 1), datetime(2026, 1, 31, 23, 59, 59)]])
workflow_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)


# add custom theme schemas

workflow_contracts_info = [
    ("Marie-Helene Rio", ["technical_officer"], ["marie-helene.rio@esa.int"]),
    ("CNR-INSTITUTE OF MARINE SCIENCES-ISMAR (IT)", ["consortium_member"], None),
    ("+ATLANTIC – Association for an Atla (PT)", ["consortium_member"], None),
]
workflow_collection = create_workflow_collection(workflow_id, workflow_title, 
                               workflow_description, workflow_license, workflow_extent,
                               workflow_keywords, workflow_formats, workflow_project, workflow_project_title)
# add contacts
workflow_collection['properties'].update({

    "contacts": [create_contract(*info) for info in workflow_contracts_info]
    
})
workflow_collection['properties']['themes'] = [
    {
        "scheme": "https://github.com/stac-extensions/osc#theme",
        "concepts": [{"id": t} for t in workflow_themes]
    }
]

for t in workflow_themes:
    workflow_collection['links'].append(
            {
                    "rel": 'related',
                    "href": f"../../{t}/land/catalog.json",
                    "type": "application/json",
                    "title": f'Theme: {t.capitalize()}'
                }
)
workflow_target_relations = ['openeo-process', 'git', 'service']
workflow_target_links = ['https://raw.githubusercontent.com/WorldCereal/worldcereal-classification/refs/tags/worldcereal_crop_extent_v1.0.1/src/worldcereal/udp/worldcereal_crop_extent.json',
                        'https://github.com/WorldCereal/worldcereal-classification.git',
                        'https://openeofed.dataspace.copernicus.eu']
workflow_target_titles = ['openEO Process Definition', 'Git source repository', 'CDSE openEO federation']

for rel, link, title in zip(workflow_target_relations, workflow_target_links, workflow_target_titles):
    workflow_collection['links'].append(
        {
                "rel": rel,
                "href": link,
                "type": "application/json",
                "title": title
            }
    )
import json
with open('record.json', 'w') as f:
    json.dump(workflow_collection, f)
# optionally run this code to transfer the generated file to the OSC folder, ready to be commited.
!mkdir -p ../open-science-catalog-metadata-staging/workflows/worldcereal-workflow2/
!cp record.json ../open-science-catalog-metadata-staging/workflows/worldcereal-workflow2/record.json