Ver código fonte

code for reading data added. pydoit and more added

Johannes Gütschow 1 ano atrás
pai
commit
e46491ecc0
12 arquivos alterados com 402 adições e 0 exclusões
  1. 6 0
      .gitignore
  2. 29 0
      dodo.py
  3. 11 0
      pyproject.toml
  4. 1 0
      requirements.txt
  5. 1 0
      requirements_dev.txt
  6. 51 0
      setup.cfg
  7. 5 0
      setup.py
  8. 9 0
      src/definitions.py
  9. 73 0
      src/helper_functions.py
  10. 121 0
      src/read_version.py
  11. 34 0
      src/read_version_datalad.py
  12. 61 0
      src/versions.py

+ 6 - 0
.gitignore

@@ -0,0 +1,6 @@
+__pycache__
+venv
+.doit.db
+.idea
+*.ipynb
+.ipynb_checkpoints

+ 29 - 0
dodo.py

@@ -0,0 +1,29 @@
+# define tasks for Andrew Cement data repository
+from doit import get_var
+
+# create virtual environment
+def task_setup_venv():
+    """Create virtual environment
+
+    Returns the doit task dictionary. The task depends on the requirements
+    and packaging configuration files, creates ``venv``, upgrades pip and
+    wheel inside it, installs this project in editable mode with the ``dev``
+    extras and finally touches the ``venv`` target.
+    """
+    pip = './venv/bin/pip'
+    steps = [
+        'python3 -m venv venv',
+        f'{pip} install --upgrade pip wheel',
+        f'{pip} install --upgrade --upgrade-strategy eager -e .[dev]',
+        # presumably refreshes the timestamp of the target folder once the
+        # installation has finished
+        'touch venv',
+    ]
+    task = dict(
+        file_dep=['requirements_dev.txt', 'setup.cfg', 'pyproject.toml'],
+        actions=steps,
+        targets=['venv'],
+        verbosity=2,
+    )
+    return task
+
+# configuration for the reading task; "version" is obtained through doit's
+# get_var and is None when no value was supplied
+read_config = dict(version=get_var('version', None))
+
+def task_read_version():
+    """ Read specific version of the data
+
+    Returns the doit task dictionary. The single action runs
+    ``src/read_version_datalad.py`` with the interpreter of the virtual
+    environment and passes the configured version on the command line. If
+    no version was configured the literal text ``None`` is passed.
+    The ``setup_venv`` task is listed as setup task.
+    """
+    command = (
+        "./venv/bin/python src/read_version_datalad.py "
+        f"--version={read_config['version']}"
+    )
+    task = {
+        'actions': [command],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+    return task

+ 11 - 0
pyproject.toml

@@ -0,0 +1,11 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel",
+    "setuptools_scm[toml]>=3.4"
+]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 88
+

+ 1 - 0
requirements.txt

@@ -0,0 +1 @@
+.

+ 1 - 0
requirements_dev.txt

@@ -0,0 +1 @@
+.[dev]

+ 51 - 0
setup.cfg

@@ -0,0 +1,51 @@
+[metadata]
+name = Global_CO2_from_cement_production
+version = 0.1
+author = Johannes Gütschow
+author_email = mail@johannes-guetschow.de
+description = Robbie Andrew's CO2 emissions from cement production data in PRIMAP2
+        format
+long_description = file: README.md
+# the content type for Markdown is "text/markdown"; "text/md" is not a
+# recognised value
+long_description_content_type = text/markdown
+url = https://github.com/JGuetschow/Global_CO2_from_cement_production
+#project_urls =
+classifiers =
+    Development Status :: 3 - Alpha
+    Intended Audience :: Science/Research
+    Topic :: Scientific/Engineering :: Atmospheric Science
+    License :: OSI Approved :: Apache Software License
+    Natural Language :: English
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
+license = Apache Software License 2.0
+license_file = LICENSE
+
+[options]
+#packages =
+python_requires = >=3.8
+setup_requires =
+    setuptools_scm
+# runtime dependencies, each listed once
+install_requires =
+    primap2
+    pycountry
+    datalad
+
+[options.extras_require]
+dev =
+    pip
+    wheel
+    primap2
+    datalad
+    black
+    ipykernel
+    jupyter
+    pycountry
+
+
+[options.package_data]
+* =
+    *.csv
+    *.nc

+ 5 - 0
setup.py

@@ -0,0 +1,5 @@
+#!/usr/bin/env python
+
+import setuptools
+
+setuptools.setup()

+ 9 - 0
src/definitions.py

@@ -0,0 +1,9 @@
+# this file holds definitions for folder names, file names, etc.
+from versions import versions
+
+# names of the folders for the downloaded and the extracted data
+downloaded_data_folder = "downloaded_data"
+extracted_data_folder = "extracted_data"
+
+def get_output_filename(version: str) -> str:
+    """Return the output file name (without suffix) for a data version.
+
+    ``version`` has to be a key of the ``versions`` dict; an unknown key
+    raises a KeyError. The name consists of a fixed prefix followed by the
+    short version string of that version.
+    """
+    prefix = "Robbie_Andrew_Cement_Production_CO2_"
+    short_version = versions[version]['ver_str_short']
+    return f"{prefix}{short_version}"

+ 73 - 0
src/helper_functions.py

@@ -0,0 +1,73 @@
+# functions for country codes etc. currently copied from UNFCCC_non-AnnexI_data
+# import from primap2 once the functionality is integrated there
+
+import pycountry
+
+# Codes that are accepted as they are: get_country_code returns its input
+# unchanged when it is a key of this dict. Currently empty.
+custom_country_mapping_code = {}
+# Names that are mapped to a code directly. get_country_code checks this
+# dict before it queries pycountry.
+custom_country_mapping_name = {
+    'Bonaire, Saint Eustatius and Saba': 'BES',
+    'Cape Verde': 'CPV',
+    'Democratic Republic of the Congo': 'COD',
+    'Faeroe Islands': 'FRO',
+    'Micronesia (Federated States of)': 'FSM',
+    'Iran': 'IRN',
+    'Laos': 'LAO',
+    'Occupied Palestinian Territory': 'PSE',
+    'Swaziland': 'SWZ',
+    'Taiwan': 'TWN',
+    'Wallis and Futuna Islands': 'WLF',
+    # aggregate entry, not a country
+    'Global': 'EARTH',
+}
+
+def get_country_code(
+    country_name: str,
+) -> str:
+    """
+    Obtain the three-letter country code for a country name or code.
+
+    Custom codes and custom names are checked first. Otherwise the input is
+    looked up as a three-letter code in pycountry and, if that fails, a fuzzy
+    search by name is performed (underscores are treated as spaces). If the
+    fuzzy search yields several candidates, only a candidate whose name is
+    exactly the input is accepted.
+
+    Parameters
+    ----------
+    country_name: str
+        Country code or name to get the three-letter code for.
+
+    Returns
+    -------
+        country_code: str
+
+    Raises
+    ------
+    ValueError
+        If the fuzzy search fails, or if it yields several candidates and
+        none of them has exactly the given name.
+    """
+    # First check if it's in the list of custom codes or custom names
+    if country_name in custom_country_mapping_code:
+        return country_name
+    if country_name in custom_country_mapping_name:
+        return custom_country_mapping_name[country_name]
+
+    # check if it's a three-letter code; a lookup that raises and a lookup
+    # that returns None are both treated as "not a known code"
+    try:
+        country = pycountry.countries.get(alpha_3=country_name)
+    except Exception:
+        country = None
+    if country is not None:
+        return country.alpha_3
+
+    # not a code, so search by name
+    try:
+        candidates = pycountry.countries.search_fuzzy(
+            country_name.replace("_", " ")
+        )
+    except Exception as err:
+        raise ValueError(
+            f"Country name {country_name} can not be mapped to "
+            f"any country code. Try using the ISO3 code directly."
+        ) from err
+
+    if len(candidates) == 1:
+        return candidates[0].alpha_3
+
+    # several candidates: only an exact name match is unambiguous. The match
+    # is returned directly so it can not be replaced by the first fuzzy hit.
+    for candidate in candidates:
+        if candidate.name == country_name:
+            return candidate.alpha_3
+
+    raise ValueError(
+        f"Country name {country_name} has {len(candidates)} "
+        f"possible results for country codes."
+    )

+ 121 - 0
src/read_version.py

@@ -0,0 +1,121 @@
+# this script reads the data for a given version and saves to primap2 native and
+# interchange format
+import pandas as pd
+import primap2 as pm2
+import os
+import argparse
+from pathlib import Path
+from helper_functions import get_country_code
+from versions import versions
+from definitions import get_output_filename, downloaded_data_folder, extracted_data_folder
+
+# handle command line parameter
+parser = argparse.ArgumentParser()
+parser.add_argument("--version", help="Version to read")
+args = parser.parse_args()
+# key into the versions dict; None if --version was not given
+version = args.version
+
+
+# netCDF encoding options applied to every data variable when saving
+compression = dict(zlib=True, complevel=9)
+# base folder for the data folders, relative to the current working directory
+# (alternative kept for reference:
+#  Path(os.path.realpath("__file__")).parents[0].absolute())
+root_path = Path("..")
+
+## set the configuration for conversion to primap2 format
+# raises a KeyError if the version is not defined in versions.py
+version_info = versions[version]
+
+# the "area" coordinate is taken from the column named "index", which is
+# created by reset_index() further below
+coords_cols = {
+    "area": "index",
+}
+
+coords_terminologies = {"area": "ISO3", "category": "IPCC2006", "scenario": "PRIMAP"}
+
+# coordinate values that are the same for all rows of the table
+coords_defaults = {
+    "source": "Andrew_Cement",
+    "provenance": "measured",
+    "category": "2.A.1",
+    "unit": version_info["unit"],
+    "entity": "CO2",
+    "scenario": version_info["ver_str_short"],
+}
+
+coords_value_mapping = {}
+
+# filters are defined per version in versions.py
+filter_keep = {}
+filter_keep.update(version_info["filter_keep"])
+
+filter_remove = {}
+filter_remove.update(version_info["filter_remove"])
+
+# metadata for the output, assembled from the version configuration
+meta_data = {
+    "references": f"{version_info['ref']}, {version_info['ref2']}",
+    "rights": "Creative Commons Attribution 4.0 International",
+    "contact": f"{version_info['contact']}",
+    "title": f"{version_info['title']} - {version_info['ver_str_long']}",
+    "comment": f"{version_info['comment']}",
+    "institution": f"{version_info['institution']}",
+}
+
+# input file and output location for this version
+filename_and_path = (root_path / downloaded_data_folder / version_info["folder"] /
+                     version_info["filename"])
+output_folder = root_path / extracted_data_folder / version_info["folder"]
+# output file name without suffix
+output_file = get_output_filename(version)
+
+# read the data
+data_pd = pd.read_csv(filename_and_path)
+
+# transpose for older versions
+if version_info["transpose"]:
+    data_pd = data_pd.transpose()
+    # the first row of the transposed table becomes the column labels,
+    # then the row labelled "Year" is removed
+    data_pd.columns = data_pd.iloc[0]
+    # idx_to_drop =  data_pd.iloc[0].index
+    data_pd = data_pd.drop("Year", axis=0)
+
+# map country names to codes
+if not version_info["country_code"]:
+    country_names = data_pd.index.to_list()
+    country_codes = {}
+    exceptions = False
+    # try all names and print every failure before aborting, so that all
+    # unmapped names are reported in a single run
+    for country in country_names:
+        try:
+            country_codes[country] = get_country_code(country)
+        except Exception as ex:
+            print(ex)
+            exceptions = True
+    data_pd = data_pd.rename(index=country_codes)
+    if exceptions:
+        raise ValueError(
+            "Exceptions occurred during mapping of country names to codes."
+        )
+else:
+    # the index is used as is; only the "UN code" and "Name" columns are
+    # removed
+    data_pd = data_pd.drop(columns=["UN code", "Name"])
+
+# column names to str for conversion to dates
+# (the format spec ".0f" requires numeric column labels)
+data_pd.columns = [f"{col:.0f}" for col in data_pd.columns]
+
+# country codes to col instead of index
+data_pd = data_pd.reset_index()
+
+
+# convert to PRIMAP2 interchange format
+data_if = pm2.pm2io.convert_wide_dataframe_if(
+    data_pd,
+    coords_cols=coords_cols,
+    coords_defaults=coords_defaults,
+    coords_terminologies=coords_terminologies,
+    coords_value_mapping=coords_value_mapping,
+    filter_keep=filter_keep,
+    filter_remove=filter_remove,
+    meta_data=meta_data,
+)
+
+# convert to PRIMAP2 native format
+data_pm2 = pm2.pm2io.from_interchange_format(data_if, data_if.attrs)
+
+# convert back to IF for standardized units
+data_if = data_pm2.pr.to_interchange_format()
+
+# save data
+# create the output folder together with missing parent folders (e.g. the
+# extracted data folder itself when the first version is read); an already
+# existing folder is not an error
+output_folder.mkdir(parents=True, exist_ok=True)
+# interchange format
+pm2.pm2io.write_interchange_format(output_folder / (output_file + ".csv"), data_if)
+# native format as netCDF, using the compression settings for all variables
+encoding = {var: compression for var in data_pm2.data_vars}
+data_pm2.pr.to_netcdf(output_folder / (output_file + ".nc"), encoding=encoding)

+ 34 - 0
src/read_version_datalad.py

@@ -0,0 +1,34 @@
+# script that calls datalad to run the data reading function
+
+import argparse
+import datalad.api
+from pathlib import Path
+from versions import versions
+from definitions import get_output_filename, downloaded_data_folder, extracted_data_folder
+
+# handle command line parameter
+parser = argparse.ArgumentParser()
+parser.add_argument("--version", help="Version to read")
+args = parser.parse_args()
+# key into the versions dict; None if --version was not given
+version = args.version
+
+root_path = Path("..")
+
+# raises a KeyError if the version is not defined in versions.py
+version_info = versions[version]
+version_folder = version_info["folder"]
+
+# file read by src/read_version.py
+input_files = [
+    root_path / downloaded_data_folder / version_folder / version_info["filename"]
+]
+
+# files written by src/read_version.py. That script writes to the extracted
+# data folder, so the outputs have to be declared there and not in the
+# downloaded data folder.
+# presumably the yaml file is written alongside the csv file by the
+# interchange format writer - TODO confirm
+suffixes = ['nc', 'yaml', 'csv']
+output_file_template = (root_path / extracted_data_folder / version_folder /
+                        get_output_filename(version))
+output_files = [f"{str(output_file_template)}.{suffix}" for suffix in suffixes]
+
+
+# NOTE(review): dry_run is enabled here, which looks like a leftover from
+# development - confirm whether the command is meant to be executed and which
+# values datalad accepts for this parameter.
+datalad.api.run(
+    cmd=f"./venv/bin/python3 src/read_version.py --version {version}",
+    dataset=root_path,
+    message=f"Read data for {version}.",
+    inputs=input_files,
+    outputs=output_files,
+    dry_run=True,
+    explicit=True,
+)

+ 61 - 0
src/versions.py

@@ -0,0 +1,61 @@
+# configurations for the different versions. mainly metadata
+
+# One entry per data version, keyed by the version name that is passed to the
+# scripts via --version. Use of the keys in src/read_version.py:
+#   ver_str_long   appended to the title in the metadata
+#   ver_str_short  used as scenario and in the output file name
+#   folder         sub folder below the downloaded and extracted data folders
+#   transpose      if True the table is transposed after reading
+#   filename       name of the csv file to read
+#   ref, ref2      combined into the "references" metadata
+#   title, institution, contact, comment
+#                  copied into the metadata
+#   filter_keep, filter_remove
+#                  filters passed to the conversion to the interchange format
+#   unit           unit used for all data
+#   country_code   if False the row labels are mapped to country codes,
+#                  if True the columns "UN code" and "Name" are dropped instead
+# 'date' is not read by the scripts shown here.
+versions = {
+    "v230428": {
+        'date': '28-Apr-2023',
+        'ver_str_long': 'version 230428',
+        'ver_str_short': '230428',
+        "folder": "v230428",
+        "transpose": True,
+        "filename": "0. GCP-CEM.csv",
+        # NOTE(review): same DOI as for v220915 - confirm that this is intended
+        'ref': '10.5281/zenodo.7081360',
+        'ref2': '10.5194/essd-11-1675-2019',
+        'title': 'Global CO2 emissions from cement production',
+        'institution': "CICERO - Center for International Climate Research",
+        'filter_keep': {},
+        'filter_remove': {},
+        'contact': "johannes.guetschow@climate-resource.com",
+        'comment': ("Published by Robbie Andrew, converted to PRIMAP2 format by "
+                    "Johannes Gütschow"),
+        'unit': 'kt * CO2 / year',
+        'country_code': True,
+    },
+    "v220915": {
+        'date': '15-Sep-2022',
+        'ver_str_long': 'version 220915',
+        'ver_str_short': '220915',
+        "folder": "v220915",
+        "transpose": True,
+        "filename": "1. Cement_emissions_data.csv",
+        'ref': '10.5281/zenodo.7081360',
+        'ref2': '10.5194/essd-11-1675-2019',
+        'title': 'Global CO2 emissions from cement production',
+        'institution': "CICERO - Center for International Climate Research",
+        'filter_keep': {},
+        'filter_remove': {},
+        'contact': "johannes.guetschow@climate-resource.com",
+        'comment': ("Published by Robbie Andrew, converted to PRIMAP2 format by "
+                    "Johannes Gütschow"),
+        'unit': 'kt * CO2 / year',
+        'country_code': False,
+    },
+    "v220516": {
+        'date': '16-May-2022',
+        'ver_str_long': 'version 220516',
+        'ver_str_short': '220516',
+        "folder": "v220516",
+        "transpose": True,
+        "filename": "1. Cement_emissions_data.csv",
+        'ref': '10.5281/zenodo.6553090',
+        'ref2': '10.5194/essd-11-1675-2019',
+        'title': 'Global CO2 emissions from cement production',
+        'institution': "CICERO - Center for International Climate Research",
+        'filter_keep': {},
+        'filter_remove': {},
+        'contact': "johannes.guetschow@climate-resource.com",
+        'comment': ("Published by Robbie Andrew, converted to PRIMAP2 format by "
+                    "Johannes Gütschow"),
+        'unit': 'kt * CO2 / year',
+        'country_code': False,
+    }
+}