Parcourir la source

ignore datasets folder until scripts to create datasets use datalad

Johannes Gütschow il y a 2 ans
2 fichiers modifiés avec 98 ajouts et 0 suppressions
  1. 1 0
  2. 97 0

+ 1 - 0

@@ -5,3 +5,4 @@ __pycache__

+ 97 - 0

@@ -0,0 +1,97 @@
+"""
+This script collects all latest CRF submissions for a given year
+Currently it only checks the extracted_data folder and not if new
+submissions are available in the downloaded data folder.
+"""
+import argparse
+import sys
+import primap2 as pm2
+from pathlib import Path
+from datetime import date
+# Root folder: parents[2] of this file's path, made absolute and resolved.
+# All other folders used by the script are derived from it.
+root_path = Path(__file__).parents[2].absolute().resolve()
+#log_path = root_path / "log"
+code_path = root_path.joinpath("code")
+downloaded_data_path = root_path.joinpath("downloaded_data", "UNFCCC")
+extracted_data_path = root_path.joinpath("extracted_data", "UNFCCC")
+dataset_path = root_path.joinpath("datasets", "UNFCCC")
+from UNFCCC_CRF_reader.util import all_crf_countries
+from UNFCCC_CRF_reader.UNFCCC_CRF_reader_prod import get_input_and_output_files_for_country
+from UNFCCC_CRF_reader.UNFCCC_CRF_reader_prod import submission_has_been_read
+# Command line interface: the submission round (year) to collect.
+# The year is required because it is used for the file lookup and for the
+# output folder and file names; without it the script cannot do anything useful.
+parser = argparse.ArgumentParser()
+parser.add_argument('--submission_year', help='Submission round to read', type=int,
+                    required=True)
+args = parser.parse_args()
+submission_year = args.submission_year
+# combined dataset, stays None until the first country has been read
+ds_all_CRF = None
+# countries whose latest submission has not been read yet
+outdated_countries = []
+# countries whose data made it into the combined dataset
+included_countries = []
+# Collect the data of every CRF country into one combined dataset.
+# A failure for one country is reported and does not stop the loop.
+for country in all_crf_countries:
+    try:
+        # determine the files and metadata for the country; the keys
+        # "code", "name", "date" and "input" of the result are used below
+        country_info = get_input_and_output_files_for_country(
+            country, submission_year=submission_year, verbose=False)
+        # check if the latest submission has been read already
+        data_read = submission_has_been_read(
+            country_info["code"], country_info["name"],
+            submission_year=submission_year,
+            submission_date=country_info["date"],
+            verbose=False,
+        )
+        if not data_read:
+            print(f"Latest submission for {country} has not been read yet.")
+            outdated_countries.append(country)
+        # read the native format file
+        input_files = [file for file in country_info["input"] if file.suffix == ".nc"]
+        if not input_files:
+            # report the missing file explicitly instead of letting
+            # input_files[0] fail with an opaque "list index out of range"
+            print(f"No native format (.nc) file found for {country}, skipping.")
+            continue
+        ds_country = pm2.open_dataset(input_files[0].as_posix())
+        # combine the per country datasets
+        if ds_all_CRF is None:
+            ds_all_CRF = ds_country
+        else:
+            ds_all_CRF = ds_all_CRF.combine_first(ds_country)
+        included_countries.append(country)
+    except Exception as ex:
+        # best effort: keep going with the remaining countries
+        print(f"Exception {ex} occurred for {country}")
+# Update metadata
+# not necessary
+# write to disc
+if ds_all_CRF is None:
+    # no country could be read, so there is nothing to write
+    sys.exit("No data could be read for any country, nothing to write.")
+# the current date becomes part of the output file name
+today = date.today()
+compression = dict(zlib=True, complevel=9)
+output_folder = dataset_path / f"CRF{submission_year}"
+output_filename = f"CRF{submission_year}_raw_{today.strftime('%Y-%m-%d')}"
+# parent folders may not exist yet on a fresh checkout, so create them as well
+output_folder.mkdir(parents=True, exist_ok=True)
+# write data in interchange format
+pm2.pm2io.write_interchange_format(output_folder / output_filename,
+                                   ds_all_CRF.pr.to_interchange_format())
+# write data in native PRIMAP2 format
+# the same compression settings are applied to every data variable
+encoding = {var: compression for var in ds_all_CRF.data_vars}
+ds_all_CRF.pr.to_netcdf(output_folder / (output_filename + ".nc"),
+                        encoding=encoding)
+# show info
+print(f"The following countries are included in the dataset: {included_countries}")
+print(f"The following countries have updated submission not yet read "
+      f"and not included in the dataset: {outdated_countries}")