vor 2 Monaten · 79aa7bfc80
--- a/src/unfccc_ghg_data/unfccc_crf_reader/crf_specifications/crt1_specification.py
+++ b/src/unfccc_ghg_data/unfccc_crf_reader/crf_specifications/crt1_specification.py
--- a/src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_devel.py
+++ b/src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_devel.py
@@ -74,6 +74,7 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
 
				     exceptions = []
			
 
				     unknown_categories = []
			
 
				     last_row_info = []
			
 
				+    empty_tables = []
			
 
				     ds_all = None
			
 
				     print(
			
 
				         f"{submission_type} test reading for {submission_type}{submission_year}. "
			
@@ -213,42 +214,52 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
 
				                         submission_type=submission_type,
			
 
				                     )
			
 
				 
			
 
				-                    # now convert to native PRIMAP2 format
			
 
				-                    ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
			
 
				-
			
 
				-                    # if individual data for emissions and removals / recovery exist
			
 
				-                    # combine them
			
 
				+                    # skip empty tables
			
 
				                     if (
			
 
				-                        ("CO2 removals" in ds_table_pm2.data_vars)
			
 
				-                        and ("CO2 emissions" in ds_table_pm2.data_vars)
			
 
				-                        and "CO2" not in ds_table_pm2.data_vars
			
 
				+                        not ds_table_if.set_index(ds_table_if.attrs["dimensions"]["*"])
			
 
				+                        .isna()
			
 
				+                        .all(axis=None)
			
 
				                     ):
			
 
				-                        # we can just sum to CO2 as we made sure that it doesn't exist.
			
 
				-                        # If we have CO2 and removals but not emissions, CO2 already has
			
 
				-                        # removals subtracted and we do nothing here
			
 
				-                        ds_table_pm2["CO2"] = ds_table_pm2[
			
 
				-                            ["CO2 emissions", "CO2 removals"]
			
 
				-                        ].pr.sum(dim="entity", skipna=True, min_count=1)
			
 
				-                        ds_table_pm2["CO2"].attrs["entity"] = "CO2"
			
 
				-
			
 
				-                    if (
			
 
				-                        ("CH4 removals" in ds_table_pm2.data_vars)
			
 
				-                        and ("CH4 emissions" in ds_table_pm2.data_vars)
			
 
				-                        and "CH4" not in ds_table_pm2.data_vars
			
 
				-                    ):
			
 
				-                        # we can just sum to CH4 as we made sure that it doesn't exist.
			
 
				-                        # If we have CH4 and removals but not emissions, CH4 already has
			
 
				-                        # removals subtracted and we do nothing here
			
 
				-                        ds_table_pm2["CH4"] = ds_table_pm2[
			
 
				-                            ["CH4 emissions", "CH4 removals"]
			
 
				-                        ].pr.sum(dim="entity", skipna=True, min_count=1)
			
 
				-                        ds_table_pm2["CH4"].attrs["entity"] = "CH4"
			
 
				-
			
 
				-                    # combine per table DS
			
 
				-                    if ds_all is None:
			
 
				-                        ds_all = ds_table_pm2
			
 
				+                        # now convert to native PRIMAP2 format
			
 
				+                        ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
			
 
				+
			
 
				+                        # if individual data for emissions and removals / recovery exist
			
 
				+                        # combine them
			
 
				+                        if (
			
 
				+                            ("CO2 removals" in ds_table_pm2.data_vars)
			
 
				+                            and ("CO2 emissions" in ds_table_pm2.data_vars)
			
 
				+                            and "CO2" not in ds_table_pm2.data_vars
			
 
				+                        ):
			
 
				+                            # we can just sum to CO2 as we made sure that it doesn't
			
 
				+                            # exist.
			
 
				+                            # If we have CO2 and removals but not emissions, CO2 already
			
 
				+                            # has removals subtracted and we do nothing here
			
 
				+                            ds_table_pm2["CO2"] = ds_table_pm2[
			
 
				+                                ["CO2 emissions", "CO2 removals"]
			
 
				+                            ].pr.sum(dim="entity", skipna=True, min_count=1)
			
 
				+                            ds_table_pm2["CO2"].attrs["entity"] = "CO2"
			
 
				+
			
 
				+                        if (
			
 
				+                            ("CH4 removals" in ds_table_pm2.data_vars)
			
 
				+                            and ("CH4 emissions" in ds_table_pm2.data_vars)
			
 
				+                            and "CH4" not in ds_table_pm2.data_vars
			
 
				+                        ):
			
 
				+                            # we can just sum to CH4 as we made sure that it doesn't
			
 
				+                            # exist.
			
 
				+                            # If we have CH4 and removals but not emissions, CH4 already
			
 
				+                            # has removals subtracted and we do nothing here
			
 
				+                            ds_table_pm2["CH4"] = ds_table_pm2[
			
 
				+                                ["CH4 emissions", "CH4 removals"]
			
 
				+                            ].pr.sum(dim="entity", skipna=True, min_count=1)
			
 
				+                            ds_table_pm2["CH4"].attrs["entity"] = "CH4"
			
 
				+
			
 
				+                        # combine per table DS
			
 
				+                        if ds_all is None:
			
 
				+                            ds_all = ds_table_pm2
			
 
				+                        else:
			
 
				+                            ds_all = ds_all.combine_first(ds_table_pm2)
			
 
				                     else:
			
 
				-                        ds_all = ds_all.combine_first(ds_table_pm2)
			
 
				+                        empty_tables.append([table, current_country_code, data_year])
			
 
				                 except Exception as e:
			
 
				                     message = (
			
 
				                         f"Error occurred when converting table {table} for"
			
@@ -291,6 +302,21 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
 
				         print(f"Data found in the last row. Saving log to " f"{log_location}")
			
 
				         save_last_row_info(last_row_info, log_location)
			
 
				 
			
 
				+    if len(empty_tables) > 0:
			
 
				+        today = date.today()
			
 
				+        if country_code is not None:
			
 
				+            log_location = (
			
 
				+                output_folder / f"{data_year}_empty_tables_{country_code}_"
			
 
				+                f"{today.strftime('%Y-%m-%d')}.csv"
			
 
				+            )
			
 
				+        else:
			
 
				+            log_location = (
			
 
				+                output_folder / f"{data_year}_empty_tables_"
			
 
				+                f"{today.strftime('%Y-%m-%d')}.csv"
			
 
				+            )
			
 
				+        print(f"Empty tables found:. Save log to {log_location}")
			
 
				+        save_empty_tables_info(empty_tables, log_location)
			
 
				+
			
 
				     # write exceptions
			
 
				     f_ex = open(
			
 
				         output_folder / f"{data_year}_exceptions_{today.strftime('%Y-%m-%d')}.txt", "w"
			
@@ -447,3 +473,50 @@ def save_last_row_info(
 
				         processed_last_row_info, columns=["Table", "Country", "Categories"]
			
 
				     )
			
 
				     df_processed_lost_row_info.to_csv(file, index=False)
			
 
				+
			
 
				+
			
 
				+def save_empty_tables_info(
			
 
				+    empty_tables: list[list],
			
 
				+    file: Path,
			
 
				+) -> None:
			
 
				+    """
			
 
				+    Save information on empty tables to a csv file.
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    empty_tables: List[List]
			
 
				+        List of lists with information on the empty tables.
			
 
				+        (which table, country and year)
			
 
				+
			
 
				+    file: pathlib.Path
			
 
				+        File including path where the data should be stored
			
 
				+
			
 
				+    """
			
 
				+    # process unknown categories
			
 
				+    df_empty_tables = pd.DataFrame(empty_tables, columns=["Table", "Country", "Year"])
			
 
				+
			
 
				+    processed_tables = []
			
 
				+    all_tables = df_empty_tables["Table"].unique()
			
 
				+    all_years = set(df_empty_tables["Year"].unique())
			
 
				+    all_years = set([year for year in all_years if isinstance(year, int)])
			
 
				+    all_years = set([year for year in all_years if int(year) > 1989])  # noqa: PLR2004
			
 
				+    for table in all_tables:
			
 
				+        df_current_table = df_empty_tables[df_empty_tables["Table"] == table]
			
 
				+        all_countries = df_current_table["Country"].unique()
			
 
				+        countries_table = ""
			
 
				+        for country in all_countries:
			
 
				+            years_country = df_current_table[df_current_table["Country"] == country][
			
 
				+                "Year"
			
 
				+            ].unique()
			
 
				+            if set(years_country) == all_years:
			
 
				+                countries_table = f"{countries_table}; {country}"
			
 
				+            else:
			
 
				+                countries_table = f"{countries_table}; {country} ({years_country})"
			
 
				+        processed_tables.append([table, countries_table])
			
 
				+
			
 
				+    if not file.parents[1].exists():
			
 
				+        file.parents[1].mkdir()
			
 
				+    if not file.parents[0].exists():
			
 
				+        file.parents[0].mkdir()
			
 
				+    df_processed_tables = pd.DataFrame(processed_tables, columns=["Table", "Countries"])
			
 
				+    df_processed_tables.to_csv(file, index=False)
			
--- a/src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_prod.py
+++ b/src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_prod.py
@@ -28,7 +28,11 @@ from .unfccc_crf_reader_core import (
 
				     get_latest_version_for_country,
			
 
				     read_crf_table,
			
 
				 )
			
 
				-from .unfccc_crf_reader_devel import save_last_row_info, save_unknown_categories_info
			
 
				+from .unfccc_crf_reader_devel import (
			
 
				+    save_empty_tables_info,
			
 
				+    save_last_row_info,
			
 
				+    save_unknown_categories_info,
			
 
				+)
			
 
				 from .util import NoCRFFilesError, all_crf_countries
			
 
				 
			
 
				 # functions:
			
@@ -158,6 +162,7 @@ def read_crf_for_country(  # noqa: PLR0912, PLR0915
 
				     if read_data or re_read:
			
 
				         unknown_categories = []
			
 
				         last_row_info = []
			
 
				+        empty_tables = []
			
 
				         for table in tables:
			
 
				             # read table for all years
			
 
				             ds_table, new_unknown_categories, new_last_row_info = read_crf_table(
			
@@ -202,42 +207,51 @@ def read_crf_for_country(  # noqa: PLR0912, PLR0915
 
				                 submission_type=submission_type,
			
 
				             )
			
 
				 
			
 
				-            # now convert to native PRIMAP2 format
			
 
				-            ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
			
 
				-
			
 
				-            # if individual data for emissions and removals / recovery exist combine
			
 
				-            # them
			
 
				+            # skip empty tables
			
 
				             if (
			
 
				-                ("CO2 removals" in ds_table_pm2.data_vars)
			
 
				-                and ("CO2 emissions" in ds_table_pm2.data_vars)
			
 
				-                and "CO2" not in ds_table_pm2.data_vars
			
 
				+                not ds_table_if.set_index(ds_table_if.attrs["dimensions"]["*"])
			
 
				+                .isna()
			
 
				+                .all(axis=None)
			
 
				             ):
			
 
				-                # we can just sum to CO2 as we made sure that it doesn't exist.
			
 
				-                # If we have CO2 and removals but not emissions, CO2 already has
			
 
				-                # removals subtracted and we do nothing here
			
 
				-                ds_table_pm2["CO2"] = ds_table_pm2[
			
 
				-                    ["CO2 emissions", "CO2 removals"]
			
 
				-                ].pr.sum(dim="entity", skipna=True, min_count=1)
			
 
				-                ds_table_pm2["CO2"].attrs["entity"] = "CO2"
			
 
				-
			
 
				-            if (
			
 
				-                ("CH4 removals" in ds_table_pm2.data_vars)
			
 
				-                and ("CH4 emissions" in ds_table_pm2.data_vars)
			
 
				-                and "CH4" not in ds_table_pm2.data_vars
			
 
				-            ):
			
 
				-                # we can just sum to CH4 as we made sure that it doesn't exist.
			
 
				-                # If we have CH4 and removals but not emissions, CH4 already has
			
 
				-                # removals subtracted and we do nothing here
			
 
				-                ds_table_pm2["CH4"] = ds_table_pm2[
			
 
				-                    ["CH4 emissions", "CH4 removals"]
			
 
				-                ].pr.sum(dim="entity", skipna=True, min_count=1)
			
 
				-                ds_table_pm2["CH4"].attrs["entity"] = "CH4"
			
 
				-
			
 
				-            # combine per table DS
			
 
				-            if ds_all is None:
			
 
				-                ds_all = ds_table_pm2
			
 
				+                # now convert to native PRIMAP2 format
			
 
				+                ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
			
 
				+
			
 
				+                # if individual data for emissions and removals / recovery exist combine
			
 
				+                # them
			
 
				+                if (
			
 
				+                    ("CO2 removals" in ds_table_pm2.data_vars)
			
 
				+                    and ("CO2 emissions" in ds_table_pm2.data_vars)
			
 
				+                    and "CO2" not in ds_table_pm2.data_vars
			
 
				+                ):
			
 
				+                    # we can just sum to CO2 as we made sure that it doesn't exist.
			
 
				+                    # If we have CO2 and removals but not emissions, CO2 already has
			
 
				+                    # removals subtracted and we do nothing here
			
 
				+                    ds_table_pm2["CO2"] = ds_table_pm2[
			
 
				+                        ["CO2 emissions", "CO2 removals"]
			
 
				+                    ].pr.sum(dim="entity", skipna=True, min_count=1)
			
 
				+                    ds_table_pm2["CO2"].attrs["entity"] = "CO2"
			
 
				+
			
 
				+                if (
			
 
				+                    ("CH4 removals" in ds_table_pm2.data_vars)
			
 
				+                    and ("CH4 emissions" in ds_table_pm2.data_vars)
			
 
				+                    and "CH4" not in ds_table_pm2.data_vars
			
 
				+                ):
			
 
				+                    # we can just sum to CH4 as we made sure that it doesn't exist.
			
 
				+                    # If we have CH4 and removals but not emissions, CH4 already has
			
 
				+                    # removals subtracted and we do nothing here
			
 
				+                    ds_table_pm2["CH4"] = ds_table_pm2[
			
 
				+                        ["CH4 emissions", "CH4 removals"]
			
 
				+                    ].pr.sum(dim="entity", skipna=True, min_count=1)
			
 
				+                    ds_table_pm2["CH4"].attrs["entity"] = "CH4"
			
 
				+
			
 
				+                # combine per table DS
			
 
				+                if ds_all is None:
			
 
				+                    ds_all = ds_table_pm2
			
 
				+                else:
			
 
				+                    ds_all = ds_all.combine_first(ds_table_pm2)
			
 
				             else:
			
 
				-                ds_all = ds_all.combine_first(ds_table_pm2)
			
 
				+                # log that table is empty
			
 
				+                empty_tables.append(table)
			
 
				 
			
 
				         # check if there were log messages.
			
 
				         save_data = True
			
@@ -269,6 +283,19 @@ def read_crf_for_country(  # noqa: PLR0912, PLR0915
 
				             )
			
 
				             save_last_row_info(last_row_info, log_location)
			
 
				 
			
 
				+        if len(empty_tables) > 0:
			
 
				+            today = date.today()
			
 
				+            log_location = (
			
 
				+                log_path
			
 
				+                / f"{submission_type}{submission_year}"
			
 
				+                / f"{country_code}_empty_tables_{today.strftime('%Y-%m-%d')}.csv"
			
 
				+            )
			
 
				+            print(
			
 
				+                f"Empty tables found for {country_code}: "
			
 
				+                f"{empty_tables}. Save log to {log_location}"
			
 
				+            )
			
 
				+            save_empty_tables_info(empty_tables, log_location)
			
 
				+
			
 
				         if save_data:
			
 
				             compression = dict(zlib=True, complevel=9)
			
 
				             output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")