Browse Source

ignore datasets folder until scripts to create datasets use datalad

Johannes Gütschow 2 years ago
parent
commit
72959753f1
2 changed files with 98 additions and 0 deletions
  1. 1 0
      .gitignore
  2. 97 0
      code/UNFCCC_CRF_reader/CRF_raw_for_year.py

+ 1 - 0
.gitignore

@@ -5,3 +5,4 @@ __pycache__
 /JG_test_code/
 .doit.db
 log
+datasets

+ 97 - 0
code/UNFCCC_CRF_reader/CRF_raw_for_year.py

@@ -0,0 +1,97 @@
+"""
+This script collects all latest CRF submissions for a given year
+
+Currently it only checks the extracted_data folder and not if new
+submission are available in the downloaded data folder.
+"""
+
+import argparse
+import sys
+import primap2 as pm2
+from pathlib import Path
+from datetime import date
+
+root_path = Path(__file__).parents[2].absolute()
+root_path = root_path.resolve()
+#log_path = root_path / "log"
+code_path = root_path / "code"
+downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
+extracted_data_path = root_path / "extracted_data" / "UNFCCC"
+dataset_path = root_path / "datasets" / "UNFCCC"
+
+sys.path.append(code_path.name)
+
+from UNFCCC_CRF_reader.util import all_crf_countries
+from UNFCCC_CRF_reader.UNFCCC_CRF_reader_prod import get_input_and_output_files_for_country
+from UNFCCC_CRF_reader.UNFCCC_CRF_reader_prod import submission_has_been_read
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--submission_year', help='Submission round to read', type=int)
+args = parser.parse_args()
+submission_year = args.submission_year
+
+ds_all_CRF = None
+outdated_countries = []
+included_countries = []
+
+for country in all_crf_countries:
+    # determine folder
+    try:
+        country_info = get_input_and_output_files_for_country(
+            country, submission_year=submission_year, verbose=False)
+
+        # check if the latest submission has been read already
+
+        data_read = submission_has_been_read(
+            country_info["code"], country_info["name"],
+            submission_year=submission_year,
+            submission_date=country_info["date"],
+            verbose=False,
+        )
+        if not data_read:
+            print(f"Latest submission for {country} has not been read yet.")
+            outdated_countries.append(country)
+
+        # read the native format file
+        input_files = [file for file in country_info["input"] if file.suffix == ".nc"]
+
+        ds_country = pm2.open_dataset(input_files[0].as_posix())
+
+        # combine per table DS
+        if ds_all_CRF is None:
+            ds_all_CRF = ds_country
+        else:
+            ds_all_CRF = ds_all_CRF.combine_first(ds_country)
+
+        included_countries.append(country)
+
+    except Exception as ex:
+        print(f"Exception {ex} occurred for {country}")
+
+
+# Update metadata
+# not necessary
+
+# write to disc
+today = date.today()
+
+compression = dict(zlib=True, complevel=9)
+output_folder = dataset_path / f"CRF{submission_year}"
+output_filename = f"CRF{submission_year}_raw_{today.strftime('%Y-%m-%d')}"
+
+if not output_folder.exists():
+    output_folder.mkdir()
+
+# write data in interchange format
+pm2.pm2io.write_interchange_format(output_folder / output_filename,
+                                   ds_all_CRF.pr.to_interchange_format())
+
+# write data in native PRIMAP2 format
+encoding = {var: compression for var in ds_all_CRF.data_vars}
+ds_all_CRF.pr.to_netcdf(output_folder / (output_filename + ".nc"),
+                      encoding=encoding)
+
+# show info
+print(f"The following countries are included in the dataset: {included_countries}")
+print(f"The following countries have updated submission not yet read "
+      f"and not included in the dataset: {outdated_countries}")