Quellcode durchsuchen

Worked on CRT1 specification including some code changes. Now ready for first read.

Johannes Gütschow vor 2 Monaten
Ursprung
Commit
79aa7bfc80

Datei-Diff unterdrückt, da er zu groß ist
+ 393 - 130
src/unfccc_ghg_data/unfccc_crf_reader/crf_specifications/crt1_specification.py


+ 106 - 33
src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_devel.py

@@ -74,6 +74,7 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
     exceptions = []
     unknown_categories = []
     last_row_info = []
+    empty_tables = []
     ds_all = None
     print(
         f"{submission_type} test reading for {submission_type}{submission_year}. "
@@ -213,42 +214,52 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
                         submission_type=submission_type,
                     )
 
-                    # now convert to native PRIMAP2 format
-                    ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
-
-                    # if individual data for emissions and removals / recovery exist
-                    # combine them
+                    # skip empty tables
                     if (
-                        ("CO2 removals" in ds_table_pm2.data_vars)
-                        and ("CO2 emissions" in ds_table_pm2.data_vars)
-                        and "CO2" not in ds_table_pm2.data_vars
+                        not ds_table_if.set_index(ds_table_if.attrs["dimensions"]["*"])
+                        .isna()
+                        .all(axis=None)
                     ):
-                        # we can just sum to CO2 as we made sure that it doesn't exist.
-                        # If we have CO2 and removals but not emissions, CO2 already has
-                        # removals subtracted and we do nothing here
-                        ds_table_pm2["CO2"] = ds_table_pm2[
-                            ["CO2 emissions", "CO2 removals"]
-                        ].pr.sum(dim="entity", skipna=True, min_count=1)
-                        ds_table_pm2["CO2"].attrs["entity"] = "CO2"
-
-                    if (
-                        ("CH4 removals" in ds_table_pm2.data_vars)
-                        and ("CH4 emissions" in ds_table_pm2.data_vars)
-                        and "CH4" not in ds_table_pm2.data_vars
-                    ):
-                        # we can just sum to CH4 as we made sure that it doesn't exist.
-                        # If we have CH4 and removals but not emissions, CH4 already has
-                        # removals subtracted and we do nothing here
-                        ds_table_pm2["CH4"] = ds_table_pm2[
-                            ["CH4 emissions", "CH4 removals"]
-                        ].pr.sum(dim="entity", skipna=True, min_count=1)
-                        ds_table_pm2["CH4"].attrs["entity"] = "CH4"
-
-                    # combine per table DS
-                    if ds_all is None:
-                        ds_all = ds_table_pm2
+                        # now convert to native PRIMAP2 format
+                        ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
+
+                        # if individual data for emissions and removals / recovery exist
+                        # combine them
+                        if (
+                            ("CO2 removals" in ds_table_pm2.data_vars)
+                            and ("CO2 emissions" in ds_table_pm2.data_vars)
+                            and "CO2" not in ds_table_pm2.data_vars
+                        ):
+                            # we can just sum to CO2 as we made sure that it doesn't
+                            # exist.
+                            # If we have CO2 and removals but not emissions, CO2 already
+                            # has removals subtracted and we do nothing here
+                            ds_table_pm2["CO2"] = ds_table_pm2[
+                                ["CO2 emissions", "CO2 removals"]
+                            ].pr.sum(dim="entity", skipna=True, min_count=1)
+                            ds_table_pm2["CO2"].attrs["entity"] = "CO2"
+
+                        if (
+                            ("CH4 removals" in ds_table_pm2.data_vars)
+                            and ("CH4 emissions" in ds_table_pm2.data_vars)
+                            and "CH4" not in ds_table_pm2.data_vars
+                        ):
+                            # we can just sum to CH4 as we made sure that it doesn't
+                            # exist.
+                            # If we have CH4 and removals but not emissions, CH4 already
+                            # has removals subtracted and we do nothing here
+                            ds_table_pm2["CH4"] = ds_table_pm2[
+                                ["CH4 emissions", "CH4 removals"]
+                            ].pr.sum(dim="entity", skipna=True, min_count=1)
+                            ds_table_pm2["CH4"].attrs["entity"] = "CH4"
+
+                        # combine per table DS
+                        if ds_all is None:
+                            ds_all = ds_table_pm2
+                        else:
+                            ds_all = ds_all.combine_first(ds_table_pm2)
                     else:
-                        ds_all = ds_all.combine_first(ds_table_pm2)
+                        empty_tables.append([table, current_country_code, data_year])
                 except Exception as e:
                     message = (
                         f"Error occurred when converting table {table} for"
@@ -291,6 +302,21 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
         print(f"Data found in the last row. Saving log to " f"{log_location}")
         save_last_row_info(last_row_info, log_location)
 
+    if len(empty_tables) > 0:
+        today = date.today()
+        if country_code is not None:
+            log_location = (
+                output_folder / f"{data_year}_empty_tables_{country_code}_"
+                f"{today.strftime('%Y-%m-%d')}.csv"
+            )
+        else:
+            log_location = (
+                output_folder / f"{data_year}_empty_tables_"
+                f"{today.strftime('%Y-%m-%d')}.csv"
+            )
+        print(f"Empty tables found:. Save log to {log_location}")
+        save_empty_tables_info(empty_tables, log_location)
+
     # write exceptions
     f_ex = open(
         output_folder / f"{data_year}_exceptions_{today.strftime('%Y-%m-%d')}.txt", "w"
@@ -447,3 +473,50 @@ def save_last_row_info(
         processed_last_row_info, columns=["Table", "Country", "Categories"]
     )
     df_processed_lost_row_info.to_csv(file, index=False)
+
+
def save_empty_tables_info(
    empty_tables: list[list],
    file: Path,
) -> None:
    """
    Save information on empty tables to a csv file.

    The file gets one row per table with the columns ``Table`` and
    ``Countries``. ``Countries`` lists every country for which the table was
    empty, each entry prefixed by ``"; "``. If a country's empty years are
    exactly the set of all integer years after 1989 that occur anywhere in
    ``empty_tables``, only the country is listed; otherwise the affected years
    follow in parentheses, e.g. ``"; DEU; FRA ([1990 1991])"``.

    Parameters
    ----------
    empty_tables: List[List]
        List of lists with information on the empty tables.
        (which table, country and year, in that order).
        A plain table name (string) is accepted as well for callers that only
        know the table; such a table is written with an empty country list.

    file: pathlib.Path
        File including path where the data should be stored. Missing parent
        folders are created.

    Raises
    ------
    ValueError
        If an entry is neither a string nor a sequence of exactly three items.

    """
    # local import: numbers.Integral also matches NumPy integer scalars, which a
    # plain `isinstance(year, int)` check does not
    from numbers import Integral

    # bring every entry into the form [table, country, year]
    records = [
        [entry, None, None] if isinstance(entry, str) else list(entry)
        for entry in empty_tables
    ]

    # Reference set of years. Non-integer years and years up to 1989 are ignored
    # (presumably because the reported time series start in 1990 -- confirm).
    all_years = {
        year
        for _, _, year in records
        if isinstance(year, Integral) and year > 1989  # noqa: PLR2004
    }

    # table -> country -> years; dicts and lists keep the order of first
    # appearance, so the output order follows the input order
    years_by_table: dict = {}
    for table, country, year in records:
        years_by_country = years_by_table.setdefault(table, {})
        if country is None:
            # table name only: keep the table, but there is no country to list
            continue
        years = years_by_country.setdefault(country, [])
        if year not in years:
            years.append(year)

    processed_tables = []
    for table, years_by_country in years_by_table.items():
        countries_table = ""
        for country, years in years_by_country.items():
            if set(years) == all_years:
                # empty in all years: the country alone is enough
                countries_table = f"{countries_table}; {country}"
            else:
                # Space separated list in brackets. Formatted by hand because
                # NumPy's array formatting pads and line-wraps long arrays.
                years_str = " ".join(
                    str(int(year)) if isinstance(year, Integral) else repr(year)
                    for year in years
                )
                countries_table = f"{countries_table}; {country} ([{years_str}])"
        processed_tables.append([table, countries_table])

    # create the whole folder hierarchy in one go; an existing folder is fine
    file.parent.mkdir(parents=True, exist_ok=True)
    df_processed_tables = pd.DataFrame(processed_tables, columns=["Table", "Countries"])
    df_processed_tables.to_csv(file, index=False)

+ 61 - 34
src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_prod.py

@@ -28,7 +28,11 @@ from .unfccc_crf_reader_core import (
     get_latest_version_for_country,
     read_crf_table,
 )
-from .unfccc_crf_reader_devel import save_last_row_info, save_unknown_categories_info
+from .unfccc_crf_reader_devel import (
+    save_empty_tables_info,
+    save_last_row_info,
+    save_unknown_categories_info,
+)
 from .util import NoCRFFilesError, all_crf_countries
 
 # functions:
@@ -158,6 +162,7 @@ def read_crf_for_country(  # noqa: PLR0912, PLR0915
     if read_data or re_read:
         unknown_categories = []
         last_row_info = []
+        empty_tables = []
         for table in tables:
             # read table for all years
             ds_table, new_unknown_categories, new_last_row_info = read_crf_table(
@@ -202,42 +207,51 @@ def read_crf_for_country(  # noqa: PLR0912, PLR0915
                 submission_type=submission_type,
             )
 
-            # now convert to native PRIMAP2 format
-            ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
-
-            # if individual data for emissions and removals / recovery exist combine
-            # them
+            # skip empty tables
             if (
-                ("CO2 removals" in ds_table_pm2.data_vars)
-                and ("CO2 emissions" in ds_table_pm2.data_vars)
-                and "CO2" not in ds_table_pm2.data_vars
+                not ds_table_if.set_index(ds_table_if.attrs["dimensions"]["*"])
+                .isna()
+                .all(axis=None)
             ):
-                # we can just sum to CO2 as we made sure that it doesn't exist.
-                # If we have CO2 and removals but not emissions, CO2 already has
-                # removals subtracted and we do nothing here
-                ds_table_pm2["CO2"] = ds_table_pm2[
-                    ["CO2 emissions", "CO2 removals"]
-                ].pr.sum(dim="entity", skipna=True, min_count=1)
-                ds_table_pm2["CO2"].attrs["entity"] = "CO2"
-
-            if (
-                ("CH4 removals" in ds_table_pm2.data_vars)
-                and ("CH4 emissions" in ds_table_pm2.data_vars)
-                and "CH4" not in ds_table_pm2.data_vars
-            ):
-                # we can just sum to CH4 as we made sure that it doesn't exist.
-                # If we have CH4 and removals but not emissions, CH4 already has
-                # removals subtracted and we do nothing here
-                ds_table_pm2["CH4"] = ds_table_pm2[
-                    ["CH4 emissions", "CH4 removals"]
-                ].pr.sum(dim="entity", skipna=True, min_count=1)
-                ds_table_pm2["CH4"].attrs["entity"] = "CH4"
-
-            # combine per table DS
-            if ds_all is None:
-                ds_all = ds_table_pm2
+                # now convert to native PRIMAP2 format
+                ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
+
+                # if individual data for emissions and removals / recovery exist combine
+                # them
+                if (
+                    ("CO2 removals" in ds_table_pm2.data_vars)
+                    and ("CO2 emissions" in ds_table_pm2.data_vars)
+                    and "CO2" not in ds_table_pm2.data_vars
+                ):
+                    # we can just sum to CO2 as we made sure that it doesn't exist.
+                    # If we have CO2 and removals but not emissions, CO2 already has
+                    # removals subtracted and we do nothing here
+                    ds_table_pm2["CO2"] = ds_table_pm2[
+                        ["CO2 emissions", "CO2 removals"]
+                    ].pr.sum(dim="entity", skipna=True, min_count=1)
+                    ds_table_pm2["CO2"].attrs["entity"] = "CO2"
+
+                if (
+                    ("CH4 removals" in ds_table_pm2.data_vars)
+                    and ("CH4 emissions" in ds_table_pm2.data_vars)
+                    and "CH4" not in ds_table_pm2.data_vars
+                ):
+                    # we can just sum to CH4 as we made sure that it doesn't exist.
+                    # If we have CH4 and removals but not emissions, CH4 already has
+                    # removals subtracted and we do nothing here
+                    ds_table_pm2["CH4"] = ds_table_pm2[
+                        ["CH4 emissions", "CH4 removals"]
+                    ].pr.sum(dim="entity", skipna=True, min_count=1)
+                    ds_table_pm2["CH4"].attrs["entity"] = "CH4"
+
+                # combine per table DS
+                if ds_all is None:
+                    ds_all = ds_table_pm2
+                else:
+                    ds_all = ds_all.combine_first(ds_table_pm2)
             else:
-                ds_all = ds_all.combine_first(ds_table_pm2)
+                # log that table is empty
+                empty_tables.append(table)
 
         # check if there were log messages.
         save_data = True
@@ -269,6 +283,19 @@ def read_crf_for_country(  # noqa: PLR0912, PLR0915
             )
             save_last_row_info(last_row_info, log_location)
 
+        if len(empty_tables) > 0:
+            today = date.today()
+            log_location = (
+                log_path
+                / f"{submission_type}{submission_year}"
+                / f"{country_code}_empty_tables_{today.strftime('%Y-%m-%d')}.csv"
+            )
+            print(
+                f"Empty tables found for {country_code}: "
+                f"{empty_tables}. Save log to {log_location}"
+            )
+            save_empty_tables_info(empty_tables, log_location)
+
         if save_data:
             compression = dict(zlib=True, complevel=9)
             output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.