
fix DI reader to work with poetry setup and automatically download data

Johannes Gütschow 9 months ago
parent
commit
d0f1e4a552

+ 0 - 1
LICENCE

@@ -1 +0,0 @@
-To be decided by project implementer

+ 3 - 3
README.md

@@ -124,9 +124,9 @@ The code has not been tested under Windows and Mac OS.
 ### Update BUR, NC, and NDC submissions
 The maintainers of this repository will update the list of submissions and the downloaded pdf files frequently. However, in some cases you might want to have the data early and do the download yourself. To avoid merge conflicts, please do this on a clean branch in your fork and make sure your branch is in sync with `main`.
 
-* **BUR**: To update the list of submissions run `make update-bur` in the main project folder. This will create a new list of submissions. To actually download the files run `make download-bur`.
-* **NC**: To update the list of submissions run `make update-nc` in the main project folder. This will create a new list of submissions. To actually download the files run `make download-nc`.
-* **NDC**: For the NDC submissions we use the list published in [openclimatedata/ndcs](https://github.com/openclimatedata/ndcs) which receives daily updates. To  download the files run `make download-ndc`.
+* **BUR**: To update the list of submissions run `poetry run doit update_bur` in the main project folder. This will create a new list of submissions. To actually download the files run `poetry run doit download_bur`.
+* **NC**: To update the list of submissions run `poetry run doit update_nc` in the main project folder. This will create a new list of submissions. To actually download the files run `poetry run doit download_nc`.
+* **NDC**: For the NDC submissions we use the list published in [openclimatedata/ndcs](https://github.com/openclimatedata/ndcs), which receives daily updates. To download the files run `poetry run doit download_ndc` (currently not working due to a data format change).
 
 All download scripts create files listing the new downloads in the folder *downloaded_data/UNFCCC*. The filenames use the format *00\_new\_downloads\_\<type\>-YYYY-MM-DD.csv* where *\<type\>* is *bur*, *nc*, or *ndc*. Currently, only one file per type and day is stored, so running a download script more than once on the same day will overwrite the earlier file, likely with an empty one as everything has already been downloaded (see also [issue #2](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/issues/2)).
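
As a rough illustration of the naming convention described above (a hedged sketch only; the actual download scripts live elsewhere in the repository and may differ in detail):

```python
from datetime import date

# Hypothetical reconstruction of the new-downloads filename described above
submission_type = "bur"  # or "nc", "ndc"
filename = f"00_new_downloads_{submission_type}-{date.today().strftime('%Y-%m-%d')}.csv"
# e.g. downloaded_data/UNFCCC/00_new_downloads_bur-2024-05-01.csv
# a second run on the same day overwrites this file (see issue #2)
```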
 

+ 22 - 7
src/unfccc_ghg_data/unfccc_crf_reader/crf_raw_for_year.py

@@ -5,18 +5,21 @@ Reads the latest data from the extracted data folder for each country.
 Notifies the user if new data are available in the downloaded_data folder
 which have not yet been read.
 
-Data are saved in the datasets/UNFCCC/CRFYYYY folder.
+Data are saved in the datasets/UNFCCC/CRFYYYY/CRTX folder.
+
+TODO: sort importing and move to datasets folder
+TODO: add datalad get to obtain the input files
 """
 
-# TODO: sort importing and move to datasets folder
 
 import argparse
 from datetime import date
 from pathlib import Path
 
+import datalad.api
 import primap2 as pm2
 
-from unfccc_ghg_data.helper import dataset_path_UNFCCC
+from unfccc_ghg_data.helper import all_countries, dataset_path_UNFCCC
 from unfccc_ghg_data.unfccc_crf_reader.unfccc_crf_reader_prod import (
     get_input_and_output_files_for_country,
     submission_has_been_read,
@@ -26,18 +29,27 @@ from unfccc_ghg_data.unfccc_crf_reader.util import all_crf_countries
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--submission_year", help="Submission round to read", type=int)
+    parser.add_argument("--type", help="CRF or CRT tables", default="CRF")
     args = parser.parse_args()
     submission_year = args.submission_year
+    type = args.type
+
+    if type == "CRF":
+        countries = all_crf_countries
+    elif type == "CRT":
+        countries = all_countries
+    else:
+        raise ValueError("Type must be CRF or CRT")  # noqa: TRY003
 
     ds_all_CRF = None
     outdated_countries = []
     included_countries = []
 
-    for country in all_crf_countries:
+    for country in countries:
         # determine folder
         try:
             country_info = get_input_and_output_files_for_country(
-                country, submission_year=submission_year, verbose=False
+                country, submission_year=submission_year, type=type, verbose=False
             )
 
             # check if the latest submission has been read already
@@ -47,6 +59,7 @@ if __name__ == "__main__":
                 country_info["name"],
                 submission_year=submission_year,
                 submission_date=country_info["date"],
+                type=type,
                 verbose=False,
             )
             if not data_read:
@@ -61,6 +74,8 @@ if __name__ == "__main__":
                 file for file in country_info["output"] if Path(file).suffix == ".nc"
             ]
 
+            datalad.api.get(input_files)
+
             ds_country = pm2.open_dataset(input_files[0])
 
             # combine per table DS
@@ -81,8 +96,8 @@ if __name__ == "__main__":
     today = date.today()
 
     compression = dict(zlib=True, complevel=9)
-    output_folder = dataset_path_UNFCCC / f"CRF{submission_year}"
-    output_filename = f"CRF{submission_year}_raw_{today.strftime('%Y-%m-%d')}"
+    output_folder = dataset_path_UNFCCC / f"{type}{submission_year}"
+    output_filename = f"{type}{submission_year}_raw_{today.strftime('%Y-%m-%d')}"
 
     if not output_folder.exists():
         output_folder.mkdir()
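
A minimal sketch of what the new `--type` switch changes downstream (names and paths mirror the diff above; the date is only an example):

```python
from datetime import date
from pathlib import Path

# dataset_path_UNFCCC is assumed to point at the datasets/UNFCCC folder
dataset_path_UNFCCC = Path("datasets/UNFCCC")
submission_year = 2023
submission_type = "CRT"  # new --type argument, defaults to "CRF"

# with type == "CRT" the script loops over all_countries instead of all_crf_countries
output_folder = dataset_path_UNFCCC / f"{submission_type}{submission_year}"
output_filename = f"{submission_type}{submission_year}_raw_{date.today().strftime('%Y-%m-%d')}"
print(output_folder / output_filename)  # e.g. datasets/UNFCCC/CRT2023/CRT2023_raw_2024-05-01
```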

+ 1 - 1
src/unfccc_ghg_data/unfccc_di_reader/read_unfccc_di_for_country.py

@@ -26,6 +26,6 @@ if __name__ == "__main__":
         read_subsectors=False,  # not applicable as we read all categories
         date_str=date_str,
         pm2if_specifications=None,  # automatically use the right specs for AI and NAI
-        default_gwp=None,  # automatically uses right default GWP for AI and NAI
+        use_gwp=None,  # automatically uses right default GWP for AI and NAI
         debug=False,
     )

+ 4 - 4
src/unfccc_ghg_data/unfccc_di_reader/unfccc_di_reader_core.py

@@ -113,7 +113,7 @@ def read_UNFCCC_DI_for_country(  # noqa: PLR0913
     data_if = convert_DI_data_to_pm2_if(
         data=data_df,
         pm2if_specifications=deepcopy(pm2if_specifications),
-        default_gwp=use_gwp,
+        use_gwp=use_gwp,
         date_str=date_str,
         debug=debug,
     )
@@ -406,7 +406,7 @@ def read_UNFCCC_DI_for_country_df_zenodo(
 def convert_DI_data_to_pm2_if(  # noqa: PLR0912, PLR0915
     data: pd.DataFrame,
     pm2if_specifications: Optional[dict] = None,
-    default_gwp: Optional[str] = None,
+    use_gwp: Optional[str] = None,
     date_str: Optional[str] = None,
     debug: bool = False,
 ) -> pd.DataFrame:
@@ -516,10 +516,10 @@ def convert_DI_data_to_pm2_if(  # noqa: PLR0912, PLR0915
         to_replace=r"(.*) CO2 equivalent", value=r"\1CO2eq", regex=True
     )
     row_idx_co2eq = data_temp["unit"].str.endswith("CO2eq")
-    if default_gwp is not None:
+    if use_gwp is not None:
         # convert all with GWPs given in input
         data_temp.loc[row_idx_co2eq, "gas"] = (
-            data_temp.loc[row_idx_co2eq, "gas"] + f" ({default_gwp})"
+            data_temp.loc[row_idx_co2eq, "gas"] + f" ({use_gwp})"
         )
     elif ai_dataset:
         # convert with AR4
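
To make the renamed `use_gwp` handling concrete, a toy example of the CO2-equivalent row treatment (the GWP string is only an example value; valid metrics are whatever primap2 accepts):

```python
import pandas as pd

data_temp = pd.DataFrame(
    {
        "gas": ["CH4", "KYOTOGHG", "CO2"],
        "unit": ["ktCO2eq", "ktCO2eq", "kt"],
    }
)
use_gwp = "SARGWP100"  # example GWP metric passed in by the caller

# rows reported in CO2-equivalent units get the metric appended to the gas name
row_idx_co2eq = data_temp["unit"].str.endswith("CO2eq")
if use_gwp is not None:
    data_temp.loc[row_idx_co2eq, "gas"] = (
        data_temp.loc[row_idx_co2eq, "gas"] + f" ({use_gwp})"
    )
print(data_temp["gas"].tolist())  # ['CH4 (SARGWP100)', 'KYOTOGHG (SARGWP100)', 'CO2']
```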

+ 4 - 4
src/unfccc_ghg_data/unfccc_di_reader/unfccc_di_reader_datalad.py

@@ -48,7 +48,7 @@ def read_DI_for_country_datalad(
     script = script.relative_to(root_path)
 
     cmd = (
-        f"./venv/bin/python3 {script.as_posix()} --country={country_info['code']} "
+        f"python3 {script.as_posix()} --country={country_info['code']} "
         f"--date={date_str}"
     )
     try:
@@ -101,7 +101,7 @@ def process_DI_for_country_datalad(
     script = script.relative_to(root_path)
 
     cmd = (
-        f"./venv/bin/python3 {script.as_posix()} --country={country_info['code']} "
+        f"python3 {script.as_posix()} --country={country_info['code']} "
         f"--date={date_str}"
     )
     try:
@@ -152,7 +152,7 @@ def read_DI_for_country_group_datalad(
     script = code_path / "unfccc_di_reader" / "read_unfccc_di_for_country_group.py"
     script = script.relative_to(root_path)
 
-    cmd = f"./venv/bin/python3 {script.as_posix()} "
+    cmd = f"python3 {script.as_posix()} "
     if annexI:
         cmd = cmd + " --annexI"
 
@@ -206,7 +206,7 @@ def process_DI_for_country_group_datalad(
     script = code_path / "unfccc_di_reader" / "process_unfccc_di_for_country_group.py"
     script = script.relative_to(root_path)
 
-    cmd = f"./venv/bin/python3 {script.as_posix()} "
+    cmd = f"python3 {script.as_posix()} "
     if annexI:
         cmd = cmd + " --annexI"
     if date_str is not None:
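
The switch from `./venv/bin/python3` to `python3` means the interpreter is resolved from the active environment (e.g. the poetry-managed one) instead of a hard-coded project-local venv. A sketch of the command string now built here (country code and date are placeholders; the surrounding datalad call is unchanged and not shown in the diff):

```python
country_code = "AFG"     # placeholder
date_str = "2024-05-01"  # placeholder

cmd = (
    f"python3 src/unfccc_ghg_data/unfccc_di_reader/read_unfccc_di_for_country.py "
    f"--country={country_code} --date={date_str}"
)
# run the pipeline via `poetry run doit ...` (or inside `poetry shell`) so that
# python3 resolves to the poetry environment
print(cmd)
```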

+ 17 - 4
src/unfccc_ghg_data/unfccc_di_reader/unfccc_di_reader_io.py

@@ -3,6 +3,7 @@ Input and output functions for the DI reader
 
 Saving single country datasets and country groups datasets
 """
+import datalad.api
 import primap2 as pm2
 import xarray as xr
 from dask.base import tokenize
@@ -68,7 +69,7 @@ def save_DI_country_data(
 
     # primap2 native format
     filename_hash_nc = filename_hash.parent / (filename_hash.name + ".nc")
-    if not filename_hash_nc.exists():
+    if not (filename_hash_nc.exists() or filename_hash_nc.is_symlink()):
         # if parent dir does not exist create it
         if not filename_hash.parent.exists():
             filename_hash.parent.mkdir()
@@ -77,15 +78,21 @@ def save_DI_country_data(
         compression = dict(zlib=True, complevel=9)
         encoding = {var: compression for var in data_pm2.data_vars}
         data_pm2.pr.to_netcdf(filename_hash_nc, encoding=encoding)
+    elif not filename_hash_nc.exists() and filename_hash_nc.is_symlink():
+        # This means that we have a broken symlink and need to download the data
+        datalad.api.get(filename_hash_nc)
 
     # primap2 IF
     filename_hash_csv = filename_hash.parent / (filename_hash.name + ".csv")
-    if not filename_hash_csv.exists():
+    if not (filename_hash_csv.exists() or filename_hash_csv.is_symlink()):
         # save the data
         print(f"Data has changed. Save to {filename_hash.name + '.csv/.yaml'}")
         pm2.pm2io.write_interchange_format(filename_hash, data_if)
     else:
         print(f"Data unchanged for {country_code}. Create symlinks.")
+        if not filename_hash_csv.exists() and filename_hash_csv.is_symlink():
+            # This means that we have a broken symlink and need to download the data
+            datalad.api.get(filename_hash_csv)
 
     # get the filename with the date
     filename_date = root_path / determine_filename(country_code, date_str, raw)
@@ -159,7 +166,7 @@ def save_DI_dataset(
     )
     # primap2 native format
     filename_hash_nc = filename_hash.parent / (filename_hash.name + ".nc")
-    if not filename_hash_nc.exists():
+    if not (filename_hash_nc.exists() or filename_hash_nc.is_symlink()):
         # if parent dir does not exist create it
         # TODO double, also in determine_dataset_filename. same for country data
         if not filename_hash.parent.exists():
@@ -169,15 +176,21 @@ def save_DI_dataset(
         compression = dict(zlib=True, complevel=9)
         encoding = {var: compression for var in data_pm2.data_vars}
         data_pm2.pr.to_netcdf(filename_hash_nc, encoding=encoding)
+    elif not filename_hash_nc.exists() and filename_hash_nc.is_symlink():
+        # This means that we have a broken symlink and need to download the data
+        datalad.api.get(filename_hash_nc)
 
     # primap2 IF
     filename_hash_csv = filename_hash.parent / (filename_hash.name + ".csv")
-    if not filename_hash_csv.exists():
+    if not (filename_hash_csv.exists() or filename_hash_csv.is_symlink()):
         # save the data
         print(f"Data has changed. Save to {filename_hash.name + '.csv/.yaml'}")
         pm2.pm2io.write_interchange_format(filename_hash, data_if)
     else:
         print(f"Data unchanged for {country_group}. Create symlinks.")
+        if not filename_hash_csv.exists() and filename_hash_csv.is_symlink():
+            # This means that we have a broken symlink and need to download the data
+            datalad.api.get(filename_hash_csv)
 
     # get the filename with the date
     filename_date = root_path / determine_dataset_filename(
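
The repeated symlink checks added above all follow one pattern: in a datalad/git-annex dataset, a file whose content has not been fetched locally appears as a symlink with a missing target, so `exists()` is False while `is_symlink()` is True, and `datalad.api.get()` retrieves the content. A small sketch of that pattern (the helper name is made up for illustration):

```python
from pathlib import Path

import datalad.api


def ensure_annexed_content(path: Path) -> None:
    """Fetch annexed file content if only a broken symlink is present locally."""
    if path.is_symlink() and not path.exists():
        # broken symlink: the file is tracked but its content is not here yet
        datalad.api.get(path)
```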