🌈 The EM27 Retrieval Pipeline 1.0.0 has been released
⚙️ API Reference
Configuration

Configuration

config.json

Example File

{
    "version": "1.4",
    "general": {
        "metadata": {
            "github_repository": "tum-esm/em27-metadata-storage",
            "access_token": null
        },
        "data": {
            "ground_pressure": {
                "path": "path-to-ground-pressure-data",
                "file_regex": "^ground-pressure-$(SENSOR_ID)-$(YYYY)-$(MM)-$(DD).csv$",
                "separator": ",",
                "datetime_column": null,
                "datetime_column_format": null,
                "date_column": "UTCdate_____",
                "date_column_format": "%Y-%m-%d",
                "time_column": "UTCtime_____",
                "time_column_format": "%H:%M:%S",
                "unix_timestamp_column": null,
                "unix_timestamp_column_format": null,
                "pressure_column": "pressure",
                "pressure_column_format": "hPa"
            },
            "atmospheric_profiles": "path-to-atmospheric-profiles",
            "interferograms": "path-to-ifg-upload-directory",
            "results": "path-to-results-storage"
        }
    },
    "profiles": {
        "server": {
            "email": "...@...",
            "max_parallel_requests": 25
        },
        "scope": {
            "from_date": "2022-01-01",
            "to_date": "2022-01-05",
            "models": [
                "GGG2014",
                "GGG2020"
            ]
        },
        "GGG2020_standard_sites": [
            {
                "identifier": "mu",
                "lat": 48.151,
                "lon": 11.569,
                "from_date": "2019-01-01",
                "to_date": "2099-12-31"
            }
        ]
    },
    "retrieval": {
        "general": {
            "max_process_count": 9,
            "ifg_file_regex": "^$(SENSOR_ID)$(DATE).*\\.\\d+$",
            "queue_verbosity": "compact"
        },
        "jobs": [
            {
                "retrieval_algorithm": "proffast-1.0",
                "atmospheric_profile_model": "GGG2014",
                "sensor_ids": [
                    "ma",
                    "mb",
                    "mc",
                    "md",
                    "me"
                ],
                "from_date": "2019-01-01",
                "to_date": "2022-12-31",
                "settings": {
                    "store_binary_spectra": true,
                    "dc_min_threshold": 0.05,
                    "dc_var_threshold": 0.1,
                    "use_local_pressure_in_pcxs": true,
                    "use_ifg_corruption_filter": false,
                    "custom_ils": {
                        "ma": {
                            "channel1_me": 0.9892,
                            "channel1_pe": -0.001082,
                            "channel2_me": 0.9892,
                            "channel2_pe": -0.001082
                        }
                    },
                    "output_suffix": "template_config"
                }
            },
            {
                "retrieval_algorithm": "proffast-2.3",
                "atmospheric_profile_model": "GGG2020",
                "sensor_ids": [
                    "ma",
                    "mb",
                    "mc",
                    "md",
                    "me"
                ],
                "from_date": "2019-01-01",
                "to_date": "2099-12-31",
                "settings": {
                    "store_binary_spectra": false,
                    "dc_min_threshold": 0.05,
                    "dc_var_threshold": 0.1,
                    "use_local_pressure_in_pcxs": false,
                    "use_ifg_corruption_filter": true,
                    "custom_ils": null,
                    "output_suffix": null
                }
            }
        ]
    },
    "bundles": [
        {
            "dst_dir": "directory-to-write-the-bundles-to",
            "output_formats": [
                "csv",
                "parquet"
            ],
            "from_datetime": "2022-01-01T00:00:00Z",
            "to_datetime": "2022-12-31T23:59:59Z",
            "retrieval_algorithms": [
                "proffast-1.0",
                "proffast-2.4"
            ],
            "atmospheric_profile_models": [
                "GGG2014",
                "GGG2020"
            ],
            "sensor_ids": [
                "ma",
                "mb",
                "mc",
                "md",
                "me"
            ],
            "bundle_suffix": null,
            "retrieval_job_output_suffix": null,
            "parse_dc_timeseries": true
        }
    ]
}

Schema definition

root*(object)
A pydantic model describing the config file schema.
additional properties allowed: false
version*(constant)
Version of the retrieval pipeline which is compatible with this config file. Retrievals done with any version `1.x` will produce the same output files as retrievals done with version `1.0`. But higher version numbers might use a different config file structure and produce more output files.
allowed value: "1.4"
general*(object)
additional properties allowed: false
metadata*(union)
If not set, the pipeline will use local metadata files or abort if the local files are not found. If local files are found, they will always be preferred over the remote data even if the remote source is configured.
default: null
option 1(object)
GitHub repository where the location data is stored.
additional properties allowed: false
github_repository*(string)
GitHub repository name, e.g. `my-org/my-repo`.
regex pattern: "^[a-z0-9-_]+/[a-z0-9-_]+$"
access_token*(union)
GitHub access token with read access to the repository, only required if the repository is private.
default: null
option 1(string)
min. length: 1
option 2(null)
option 2(null)
data*(object)
Location where the input data sourced from.
additional properties allowed: false
ground_pressure*(object)
directory path and format configuration of the ground pressure files
additional properties allowed: false
path*(string)
Directory path to ground pressure files.
file_regex*(string)
A regex string to match the ground pressure file names. In this string, you can use the placeholders `$(SENSOR_ID)`, `$(YYYY)`, `$(YY)`, `$(MM)`, and `$(DD)` to make this regex target a certain station and date. The placeholder `$(DATE)` is a shortcut for `$(YYYY)$(MM)$(DD)`.
min. length: 1
examples: [ "^$(DATE).tsv$", "^$(SENSOR_ID)_$(DATE).dat$", "^ground-pressure-$(SENSOR_ID)-$(YYYY)-$(MM)-$(DD).csv$" ]
separator*(string)
Separator used in the ground pressure files. Only needed and used if the file format is `text`.
min. length: 1
max. length: 1
examples: [ ",", "\t", " ", ";" ]
datetime_column*(union)
Column name in the ground pressure files that contains the datetime.
default: null
examples: [ "datetime", "dt", "utc-datetime" ]
option 1(string)
option 2(null)
datetime_column_format*(union)
Format of the datetime column in the ground pressure files.
default: null
examples: [ "%Y-%m-%dT%H:%M:%S" ]
option 1(string)
option 2(null)
date_column*(union)
Column name in the ground pressure files that contains the date.
default: null
examples: [ "date", "d", "utc-date" ]
option 1(string)
option 2(null)
date_column_format*(union)
Format of the date column in the ground pressure files.
default: null
examples: [ "%Y-%m-%d", "%Y%m%d", "%d.%m.%Y" ]
option 1(string)
option 2(null)
time_column*(union)
Column name in the ground pressure files that contains the time.
default: null
examples: [ "time", "t", "utc-time" ]
option 1(string)
option 2(null)
time_column_format*(union)
Format of the time column in the ground pressure files.
default: null
examples: [ "%H:%M:%S", "%H:%M", "%H%M%S" ]
option 1(string)
option 2(null)
unix_timestamp_column*(union)
Column name in the ground pressure files that contains the unix timestamp.
default: null
examples: [ "unix-timestamp", "timestamp", "ts" ]
option 1(string)
option 2(null)
unix_timestamp_column_format*(union)
Format of the unix timestamp column in the ground pressure files. I.e. is the Unix timestamp in seconds, milliseconds, etc.?
default: null
option 1(enum)
allowed values: [ "s", "ms", "us", "ns" ]
option 2(null)
pressure_column*(string)
Column name in the ground pressure files that contains the pressure.
examples: [ "pressure", "p", "ground_pressure" ]
pressure_column_format*(enum)
Format of the pressure column in the ground pressure files.
allowed values: [ "hPa", "Pa" ]
atmospheric_profiles*(string)
directory path to atmospheric profile files
interferograms*(string)
directory path to ifg files
results*(string)
directory path to results
profiles*(union)
default: null
option 1(object)
Settings for vertical profiles retrieval. If `null`, the vertical profiles script will stop and log a warning
additional properties allowed: false
server*(object)
Settings for accessing the ccycle ftp server. Besides the `email` field, these can be left as default in most cases.
additional properties allowed: false
email*(string)
Email address to use to log in to the ccycle ftp server.
min. length: 3
max_parallel_requests*(integer)
Maximum number of requests to put in the queue on the ccycle server at the same time. Only when a request is finished, a new one can enter the queue.
min.: 1
max.: 200
scope*(union)
Scope of the vertical profiles to request from the ccycle ftp server. If set to `null`, the script will not request any vertical profiles besides the configured standard sites.
default: null
option 1(object)
additional properties allowed: false
from_date(string)
Date in format `YYYY-MM-DD` from which to request vertical profile data.
default: "1900-01-01"
to_date(string)
Date in format `YYYY-MM-DD` until which to request vertical profile data.
default: "2100-01-01"
models*(array)
list of data types to request from the ccycle ftp server.
#(enum)
allowed values: [ "GGG2014", "GGG2020" ]
option 2(null)
GGG2020_standard_sites*(array)
List of standard sites to request from the ccycle ftp server. The requests for these standard sites are done before any other requests so that data available for these is not rerequested for other sensors. See https://tccon-wiki.caltech.edu/Main/ObtainingGinputData#Requesting_to_be_added_as_a_standard_site for more information.
#(object)
additional properties allowed: false
identifier*(string)
The identifier on the caltech server
lat*(number)
min.: -90
max.: 90
lon*(number)
min.: -180
max.: 180
from_date*(string)
Date in format `YYYY-MM-DD` from which this standard site is active.
to_date*(string)
Date in format `YYYY-MM-DD` until which this standard site is active.
option 2(null)
retrieval*(union)
default: null
option 1(object)
Settings for automated proffast processing. If `null`, the automated proffast script will stop and log a warning
additional properties allowed: false
general*(object)
additional properties allowed: false
max_process_count(integer)
How many parallel processes to dispatch. There will be one process per sensor-day. With hyper-threaded CPUs, this can be higher than the number of physical cores.
min.: 1
max.: 128
default: 1
ifg_file_regex*(string)
A regex string to match the ifg file names. In this string, `$(SENSOR_ID)`, `$(YYYY)`, `$(YY)`, `$(MM)`, and `$(DD)` are placeholders to target a certain station and date. The placeholder `$(DATE)` is a shortcut for `$(YYYY)$(MM)$(DD)`. They don't have to be used - you can also run the retrieval on any file it finds in the directory using `.*`
min. length: 1
examples: [ "^.*\\.\\d+$", "^$(SENSOR_ID)$(DATE).*\\.\\d+$", "^$(SENSOR_ID)-$(YYYY)-$(MM)-$(DD).*\\.nc$" ]
queue_verbosity(enum)
How much information the retrieval queue should print out. In `verbose` mode it will print out the full list of sensor-days for each step of the filtering process. This can help when figuring out why a certain sensor-day is not processed.
allowed values: [ "compact", "verbose" ]
default: "compact"
jobs*(array)
List of retrievals to run. The list will be processed sequentially.
#(object)
Settings for filtering the storage data. Only used if `config.data_sources.storage` is `true`.
additional properties allowed: false
retrieval_algorithm*(enum)
Which retrieval algorithms to use. Proffast 2.X uses the Proffast Pylot under the hood to dispatch it. Proffast 1.0 uses a custom implementation by us similar to the Proffast Pylot.
allowed values: [ "proffast-1.0", "proffast-2.2", "proffast-2.3", "proffast-2.4", "proffast-2.4.1" ]
atmospheric_profile_model*(enum)
Which vertical profiles to use for the retrieval.
allowed values: [ "GGG2014", "GGG2020" ]
sensor_ids*(array)
Sensor ids to consider in the retrieval.
#(string)
from_date*(string)
Date string in format `YYYY-MM-DD` from which to consider data in the storage directory.
to_date*(string)
Date string in format `YYYY-MM-DD` until which to consider data in the storage directory.
settings(object)
Advanced settings that only apply to this retrieval job
additional properties allowed: false
store_binary_spectra(boolean)
Whether to store the binary spectra files. These are the files that are used by the retrieval algorithm. They are not needed for the output files, but can be useful for debugging.
default: false
dc_min_threshold(number)
Value used for the `DC_min` threshold in Proffast. If not set, defaults to the Proffast default.
min.: 0.001
max.: 0.999
default: 0.05
dc_var_threshold(number)
Value used for the `DC_var` threshold in Proffast. If not set, defaults to the Proffast default.
min.: 0.001
max.: 0.999
default: 0.1
use_local_pressure_in_pcxs(boolean)
Whether to use the local pressure in the pcxs files. If not used, it will tell PCXS to use the pressure from the atmospheric profiles (set the input value in the `.inp` file to `9999.9`). If used, the pipeline computes the solar noon time using `skyfield` and averages the local pressure over the time period noon-2h to noon+2h.
default: false
use_ifg_corruption_filter(boolean)
Whether to use the ifg corruption filter. This filter is a program based on `preprocess4` and is part of the `tum-esm-utils` library: https://tum-esm-utils.netlify.app/api-reference#tum_esm_utilsinterferograms. If activated, we will only pass the interferograms to the retrieval algorithm that pass the filter - i.e. that won't cause it to crash.
default: true
custom_ils*(union)
Maps sensor IDs to ILS correction values. If not set, the pipeline will use the values published inside the Proffast Pylot codebase (https://gitlab.eudat.eu/coccon-kit/proffastpylot/-/blob/master/prfpylot/ILSList.csv?ref_type=heads).
default: null
option 1(object)
additional properties allowed: true
no schema enforced
option 2(null)
output_suffix*(union)
Suffix to append to the output folders. If not set, the pipeline output folders are named `sensorid/YYYYMMDD/`. If set, the folders are named `sensorid/YYYYMMDD_suffix/`. This is useful when having multiple retrieval jobs processing the same sensor dates with different settings.
default: null
option 1(string)
option 2(null)
option 2(null)
bundles*(union)
List of output bundling targets.
default: null
option 1(array)
#(object)
There will be one file per sensor id and atmospheric profile and retrieval algorithm combination. The final name looks like `em27-retrieval-bundle-$SENSOR_ID-$RETRIEVAL_ALGORITHM-$ATMOSPHERIC_PROFILE-$FROM_DATE-$TO_DATE$BUNDLE_SUFFIX.$OUTPUT_FORMAT`, e.g. `em27-retrieval-bundle-ma-GGG2020-proffast-2.4-20150801-20240523-v2.1.csv`. The bundle suffix is optional and can be used to distinguish between different internal datasets.
additional properties allowed: false
dst_dir*(string)
Directory to write the bundled outputs to.
output_formats*(array)
List of output formats to write the merged output files in.
#(enum)
allowed values: [ "csv", "parquet" ]
from_datetime*(string)
Date in format `YYYY-MM-DDTHH:MM:SS` from which to bundle data
to_datetime*(string)
Date in format `YYYY-MM-DDTHH:MM:SS` to which to bundle data
retrieval_algorithms*(array)
The retrieval algorithms for which to bundle the outputs
#(enum)
allowed values: [ "proffast-1.0", "proffast-2.2", "proffast-2.3", "proffast-2.4", "proffast-2.4.1" ]
atmospheric_profile_models*(array)
The atmospheric profile models for which to bundle the outputs
#(enum)
allowed values: [ "GGG2014", "GGG2020" ]
sensor_ids*(array)
The sensor ids for which to bundle the outputs
#(string)
bundle_suffix*(union)
Suffix to append to the output bundles.
default: null
examples: [ "v2.1", "v2.2", "oco2-gradient-paper-2021" ]
option 1(string)
min. length: 1
option 2(null)
retrieval_job_output_suffix*(union)
When you ran the retrieval with a custom suffix, you can specify it here to only bundle the outputs of this suffix. Use the same value here as in the field `config.retrieval.jobs[i].settings.output_suffix`.
default: null
option 1(string)
option 2(null)
parse_dc_timeseries(boolean)
Whether to parse the DC timeseries from the results directories. This is an output only available in this Pipeline for Proffast 2.4. We adapted the preprocessor to output the DC min/mean/max/variation values for each record of data. If you are having issues with a low signal intensity on one or both channels, you can run the retrieval with a very low DC_min threshold and filter the data afterwards instead of having to rerun the retrieval.
default: false
option 2(null)