
CRT1 specs for countries which have submitted so far (with a few country-specific things not implemented yet)

Johannes Gütschow, 2 months ago
parent
commit
6a2aba899a

+ 2 - 0
src/unfccc_ghg_data/unfccc_crf_reader/crf_specifications/__init__.py

@@ -7,6 +7,7 @@ from .crf2022_specification import CRF2022
 from .crf2023_aus_specification import CRF2023_AUS
 from .crf2023_specification import CRF2023
 from .crf2024_specification import CRF2024
+from .crt1_pry_specification import CRT1_PRY
 from .crt1_specification import CRT1
 from .crt1_tun_specification import CRT1_TUN
 
@@ -17,5 +18,6 @@ __all__ = [
     "CRF2023_AUS",
     "CRF2024",
     "CRT1",
+    "CRT1_PRY",
     "CRT1_TUN",
 ]
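The country-specific specifications follow a CRT1_<ISO3> naming pattern (PRY is Paraguay, TUN is Tunisia). Below is a hedged sketch of how a caller could resolve a country-specific spec with a fallback to the generic CRT1; get_crt1_spec is a hypothetical helper used for illustration, not part of this commit.

# Hedged sketch: resolve a country-specific CRT1 spec, falling back to the
# generic one. Only the CRT1_<ISO3> naming pattern is taken from this commit;
# the helper itself is hypothetical.
from unfccc_ghg_data.unfccc_crf_reader import crf_specifications as specs


def get_crt1_spec(country_code: str) -> dict:
    """Return CRT1_<ISO3> if it exists, otherwise the generic CRT1 spec."""
    return getattr(specs, f"CRT1_{country_code}", specs.CRT1)


spec = get_crt1_spec("PRY")  # -> CRT1_PRY
spec = get_crt1_spec("DEU")  # -> generic CRT1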

+ 20 - 0
src/unfccc_ghg_data/unfccc_crf_reader/crf_specifications/crt1_chl_specification.py

@@ -0,0 +1,20 @@
+"""
+
+CRT1 specification for Chile.
+
+Header in Table3.B(b) differs. This is a quick fix. In the future, the column matching
+should be improved to allow for different column names, at least for the ignored columns.
+
+"""
+
+from copy import deepcopy
+
+from .crt1_specification import CRT1
+
+gwp_to_use = "AR5GWP100"
+
+CRT1_CHL = deepcopy(CRT1)
+
+CRT1_CHL["Table3.B(b)"]["table"]["cols_to_ignore"][
+    3
+] = "ACTIVITY DATA AND OTHER RELATED INFORMATION Typical animal mass (average) (kg/ animal)"

+ 16 - 1151
src/unfccc_ghg_data/unfccc_crf_reader/crf_specifications/crt1_pry_specification.py
The file diff was not shown because the file is too large.


+ 367 - 128
src/unfccc_ghg_data/unfccc_crf_reader/crf_specifications/crt1_specification.py
The file diff was not shown because the file is too large.


+ 34 - 3
src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_core.py

@@ -32,7 +32,7 @@ pd.set_option("future.no_silent_downcasting", True)
 
 
 ### reading functions
-def convert_crf_table_to_pm2if(  # noqa: PLR0912, PLR0913
+def convert_crf_table_to_pm2if(  # noqa: PLR0912, PLR0913, PLR0915
     df_table: pd.DataFrame,
     submission_year: int,
     entity_mapping: dict[str, str] | None = None,
@@ -42,6 +42,7 @@ def convert_crf_table_to_pm2if(  # noqa: PLR0912, PLR0913
     meta_data_input: dict[str, str] | None = None,
     submission_type: str = "CRF",
     decimal_sep: str = ".",
+    thousands_sep: str = ",",
 ) -> pd.DataFrame:
     """
     Convert a given pandas long format crf table to PRIMAP2 interchange format
@@ -69,6 +70,8 @@ def convert_crf_table_to_pm2if(  # noqa: PLR0912, PLR0913
         read CRF or CRF data
     decimal_sep: str default = '.'
         decimal separator to use to interpret the data.
+    thousands_sep: str default = ','
+        thousands separator to use to interpret the data.
 
     Returns
     -------
@@ -178,6 +181,25 @@ def convert_crf_table_to_pm2if(  # noqa: PLR0912, PLR0913
             meta_data[key] = meta_data_input[key]
 
     # fix decimal separator
+    sep_regexp_special = ["."]
+
+    if decimal_sep != ".":
+        if thousands_sep in sep_regexp_special:
+            regex_thousands = f"([0-9]+)\\{thousands_sep}([0-9,]+)"
+        else:
+            regex_thousands = f"([0-9]+){thousands_sep}([0-9,]+)"
+        if decimal_sep in sep_regexp_special:
+            regex_decimal = f"([0-9]+)\\{decimal_sep}([0-9]+)"
+        else:
+            regex_decimal = f"([0-9]+){decimal_sep}([0-9]+)"
+        # first remove thousand sep
+        df_table = df_table.replace(
+            to_replace=regex_thousands, value=r"\1\2", regex=True
+        )
+        # now replace the decimal sep by a dot
+        df_table = df_table.replace(
+            to_replace=regex_decimal, value=r"\1.\2", regex=True
+        )
 
     df_table_if = pm2.pm2io.convert_long_dataframe_if(
         df_table,
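The added block strips a thousands separator and converts the decimal separator to "." before the numeric conversion. Below is a self-contained sketch of the same two-step regex replacement, using re.escape instead of the hand-maintained list of regex special characters; the function name and sample values are invented.

# Self-contained sketch of the two-step separator normalisation added above.
import re

import pandas as pd


def normalise_separators(
    df: pd.DataFrame, decimal_sep: str = ",", thousands_sep: str = "."
) -> pd.DataFrame:
    """Drop the thousands separator, then turn the decimal separator into '.'."""
    if decimal_sep == ".":
        return df  # data already uses the default separators
    thousands = re.escape(thousands_sep)
    decimal = re.escape(decimal_sep)
    # step 1: remove the thousands separator ("1.234,56" -> "1234,56")
    df = df.replace(
        to_replace=f"([0-9]+){thousands}([0-9,]+)", value=r"\1\2", regex=True
    )
    # step 2: replace the decimal separator by a dot ("1234,56" -> "1234.56")
    df = df.replace(
        to_replace=f"([0-9]+){decimal}([0-9]+)", value=r"\1.\2", regex=True
    )
    return df


df = pd.DataFrame({"2021": ["1.234,56", "0,75", "NO"]})
print(normalise_separators(df)["2021"].tolist())  # ['1234.56', '0.75', 'NO']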
@@ -204,7 +226,7 @@ def read_crf_table(  # noqa: PLR0913, PLR0912, PLR0915
     folder: str | None = None,
     submission_type: str = "CRF",
     debug: bool = False,
-) -> tuple[pd.DataFrame, list[list], list[list]]:
+) -> tuple[pd.DataFrame, list[list], list[list], bool]:
     """
     Read CRF table for given year and country/countries
 
@@ -246,6 +268,7 @@ def read_crf_table(  # noqa: PLR0913, PLR0912, PLR0915
         * Third return parameter holds information on data found in the last read row.
           This is used as a hint to check if table specifications might have to
           be adapted as country submitted tables are longer than expected.
+        * The fourth return parameter is true if the worksheet to read is not present in the file
 
     """
     # check type
@@ -342,6 +365,7 @@ def read_crf_table(  # noqa: PLR0913, PLR0912, PLR0915
     df_all = None
     unknown_rows = []
     last_row_info = []
+    not_present = False
     for file in input_files:
         file_info = get_info_from_crf_filename(file.name)
         try:
@@ -359,10 +383,17 @@ def read_crf_table(  # noqa: PLR0913, PLR0912, PLR0915
                 df_all = pd.concat([df_this_file, df_all])
                 unknown_rows = unknown_rows + unknown_rows_this_file
                 last_row_info = last_row_info + last_row_info_this_file
+        except ValueError as e:
+            if e.args[0] == f"Worksheet named '{table}' not found":
+                print(f"Table {table} not present")
+                not_present = True
+                pass
+            else:
+                print(f"Error when reading file {file}. Skipping file. Exception: {e}")
         except Exception as e:
             print(f"Error when reading file {file}. Skipping file. Exception: {e}")
 
-    return df_all, unknown_rows, last_row_info
+    return df_all, unknown_rows, last_row_info, not_present
 
 
 def read_crf_table_from_file(  # noqa: PLR0912, PLR0915
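read_crf_table now returns a fourth value, not_present, set when pandas reports that the requested worksheet does not exist in the xlsx file. A sketch of that detection in isolation; the file path in the commented example is made up.

# Sketch of the missing-worksheet detection in isolation: pandas raises
# ValueError("Worksheet named '...' not found") for an absent sheet, which is
# turned into a boolean instead of a generic read error.
import pandas as pd


def try_read_sheet(path: str, sheet: str) -> tuple[pd.DataFrame | None, bool]:
    """Return (df, not_present) for one worksheet of an xlsx submission."""
    try:
        return pd.read_excel(path, sheet_name=sheet), False
    except ValueError as err:
        if err.args[0] == f"Worksheet named '{sheet}' not found":
            return None, True  # table genuinely missing from this submission
        raise  # any other ValueError is a real read problem


# example (path is illustrative):
# df, not_present = try_read_sheet("PRY_CRT1_2024.xlsx", "Table3.B(b)")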

+ 114 - 68
src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_devel.py

@@ -75,6 +75,7 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
     unknown_categories = []
     last_row_info = []
     empty_tables = []
+    missing_worksheets = []
     ds_all = None
     print(
         f"{submission_type} test reading for {submission_type}{submission_year}. "
@@ -173,6 +174,7 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
                         ds_table,
                         new_unknown_categories,
                         new_last_row_info,
+                        not_present,
                     ) = read_crf_table(
                         current_country_code,
                         table,
@@ -187,79 +189,108 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
                     unknown_categories = unknown_categories + new_unknown_categories
                     last_row_info = last_row_info + new_last_row_info
 
-                    # convert to PRIMAP2 IF
-                    # first drop the orig_cat_name col as it can have multiple values
-                    # for one category
-                    ds_table = ds_table.drop(columns=["orig_cat_name"])
-
-                    # TODO: catch entity conversion errors and make list of error
-                    #  entities
-                    # if we need to map entities pass this info to the conversion
-                    # function
-                    if "entity_mapping" in crf_spec[table]:
-                        entity_mapping = crf_spec[table]["entity_mapping"]
-                    else:
-                        entity_mapping = None
-
-                    ds_table_if = convert_crf_table_to_pm2if(
-                        ds_table,
-                        submission_year,
-                        meta_data_input={
-                            "title": f"Data submitted in {submission_year} to the "
-                            f"UNFCCC in the {type_name} ({submission_type}) "
-                            f"by {country_name}. "
-                            f"Submission date / version: {date_or_version}"
-                        },
-                        entity_mapping=entity_mapping,
-                        submission_type=submission_type,
-                    )
-
-                    # skip empty tables
-                    if (
-                        not ds_table_if.set_index(ds_table_if.attrs["dimensions"]["*"])
-                        .isna()
-                        .all(axis=None)
-                    ):
-                        # now convert to native PRIMAP2 format
-                        ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
-
-                        # if individual data for emissions and removals / recovery exist
-                        # combine them
-                        if (
-                            ("CO2 removals" in ds_table_pm2.data_vars)
-                            and ("CO2 emissions" in ds_table_pm2.data_vars)
-                            and "CO2" not in ds_table_pm2.data_vars
-                        ):
-                            # we can just sum to CO2 as we made sure that it doesn't
-                            # exist.
-                            # If we have CO2 and removals but not emissions, CO2 already
-                            # has removals subtracted and we do nothing here
-                            ds_table_pm2["CO2"] = ds_table_pm2[
-                                ["CO2 emissions", "CO2 removals"]
-                            ].pr.sum(dim="entity", skipna=True, min_count=1)
-                            ds_table_pm2["CO2"].attrs["entity"] = "CO2"
+                    if ds_table is not None:
+                        # convert to PRIMAP2 IF
+                        # first drop the orig_cat_name col as it can have multiple
+                        # values for one category
+                        ds_table = ds_table.drop(columns=["orig_cat_name"])
+
+                        # TODO: catch entity conversion errors and make list of error
+                        #  entities
+                        # if we need to map entities pass this info to the conversion
+                        # function
+                        if "entity_mapping" in crf_spec[table]:
+                            entity_mapping = crf_spec[table]["entity_mapping"]
+                        else:
+                            entity_mapping = None
 
+                        if "decimal_sep" in crf_spec[table]["table"]:
+                            decimal_sep = crf_spec[table]["table"]["decimal_sep"]
+                        else:
+                            decimal_sep = "."
+                        if "thousands_sep" in crf_spec[table]["table"]:
+                            thousands_sep = crf_spec[table]["table"]["thousands_sep"]
+                        else:
+                            thousands_sep = ","
+
+                        ds_table_if = convert_crf_table_to_pm2if(
+                            ds_table,
+                            submission_year,
+                            meta_data_input={
+                                "title": f"Data submitted in {submission_year} to the "
+                                f"UNFCCC in the {type_name} ({submission_type}) "
+                                f"by {country_name}. "
+                                f"Submission date / version: {date_or_version}"
+                            },
+                            entity_mapping=entity_mapping,
+                            submission_type=submission_type,
+                            decimal_sep=decimal_sep,
+                            thousands_sep=thousands_sep,
+                        )
+
+                        # skip empty tables
                         if (
-                            ("CH4 removals" in ds_table_pm2.data_vars)
-                            and ("CH4 emissions" in ds_table_pm2.data_vars)
-                            and "CH4" not in ds_table_pm2.data_vars
+                            not ds_table_if.set_index(
+                                ds_table_if.attrs["dimensions"]["*"]
+                            )
+                            .isna()
+                            .all(axis=None)
                         ):
-                            # we can just sum to CH4 as we made sure that it doesn't
-                            # exist.
-                            # If we have CH4 and removals but not emissions, CH4 already
-                            # has removals subtracted and we do nothing here
-                            ds_table_pm2["CH4"] = ds_table_pm2[
-                                ["CH4 emissions", "CH4 removals"]
-                            ].pr.sum(dim="entity", skipna=True, min_count=1)
-                            ds_table_pm2["CH4"].attrs["entity"] = "CH4"
-
-                        # combine per table DS
-                        if ds_all is None:
-                            ds_all = ds_table_pm2
+                            # now convert to native PRIMAP2 format
+                            ds_table_pm2 = pm2.pm2io.from_interchange_format(
+                                ds_table_if
+                            )
+
+                            # if individual data for emissions and removals /
+                            # recovery exist combine them
+                            if (
+                                ("CO2 removals" in ds_table_pm2.data_vars)
+                                and ("CO2 emissions" in ds_table_pm2.data_vars)
+                                and "CO2" not in ds_table_pm2.data_vars
+                            ):
+                                # we can just sum to CO2 as we made sure that it doesn't
+                                # exist.
+                                # If we have CO2 and removals but not emissions,
+                                # CO2 already
+                                # has removals subtracted and we do nothing here
+                                ds_table_pm2["CO2"] = ds_table_pm2[
+                                    ["CO2 emissions", "CO2 removals"]
+                                ].pr.sum(dim="entity", skipna=True, min_count=1)
+                                ds_table_pm2["CO2"].attrs["entity"] = "CO2"
+
+                            if (
+                                ("CH4 removals" in ds_table_pm2.data_vars)
+                                and ("CH4 emissions" in ds_table_pm2.data_vars)
+                                and "CH4" not in ds_table_pm2.data_vars
+                            ):
+                                # we can just sum to CH4 as we made sure that it doesn't
+                                # exist.
+                                # If we have CH4 and removals but not emissions, CH4
+                                # already has removals subtracted and we do nothing here
+                                ds_table_pm2["CH4"] = ds_table_pm2[
+                                    ["CH4 emissions", "CH4 removals"]
+                                ].pr.sum(dim="entity", skipna=True, min_count=1)
+                                ds_table_pm2["CH4"].attrs["entity"] = "CH4"
+
+                            # combine per table DS
+                            if ds_all is None:
+                                ds_all = ds_table_pm2
+                            else:
+                                ds_all = ds_all.combine_first(ds_table_pm2)
                         else:
-                            ds_all = ds_all.combine_first(ds_table_pm2)
+                            empty_tables.append(
+                                [table, current_country_code, data_year]
+                            )
+                    elif not_present:
+                        # log that table is not present
+                        missing_worksheets.append(
+                            [table, current_country_code, data_year]
+                        )
                     else:
-                        empty_tables.append([table, current_country_code, data_year])
+                        print(
+                            f"Empty DataFrame returned for table {table}, "
+                            f"country {current_country_code}. Check log for errors."
+                        )
                 except Exception as e:
                     message = (
                         f"Error occurred when converting table {table} for"
@@ -317,6 +348,21 @@ def read_year_to_test_specs(  # noqa: PLR0912, PLR0915
         print(f"Empty tables found:. Save log to {log_location}")
         save_empty_tables_info(empty_tables, log_location)
 
+    if len(missing_worksheets) > 0:
+        today = date.today()
+        if country_code is not None:
+            log_location = (
+                output_folder / f"{data_year}_missing_tables_{country_code}_"
+                f"{today.strftime('%Y-%m-%d')}.csv"
+            )
+        else:
+            log_location = (
+                output_folder / f"{data_year}_missing_tables_"
+                f"{today.strftime('%Y-%m-%d')}.csv"
+            )
+        print(f"Missing worksheets. Save log to {log_location}")
+        save_empty_tables_info(missing_worksheets, log_location)
+
     # write exceptions
     f_ex = open(
         output_folder / f"{data_year}_exceptions_{today.strftime('%Y-%m-%d')}.txt", "w"

+ 111 - 70
src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_prod.py

@@ -163,9 +163,15 @@ def read_crf_for_country(  # noqa: PLR0912, PLR0915
         unknown_categories = []
         last_row_info = []
         empty_tables = []
+        missing_worksheets = []
         for table in tables:
             # read table for all years
-            ds_table, new_unknown_categories, new_last_row_info = read_crf_table(
+            (
+                ds_table,
+                new_unknown_categories,
+                new_last_row_info,
+                not_present,
+            ) = read_crf_table(
                 country_code,
                 table,
                 submission_year,
@@ -177,81 +183,103 @@ def read_crf_for_country(  # noqa: PLR0912, PLR0915
             unknown_categories = unknown_categories + new_unknown_categories
             last_row_info = last_row_info + new_last_row_info
 
-            # convert to PRIMAP2 IF
-            # first drop the orig_cat_name col as it can have multiple values for
-            # one category
-            ds_table = ds_table.drop(columns=["orig_cat_name"])
+            if ds_table is not None:
+                # convert to PRIMAP2 IF
+                # first drop the orig_cat_name col as it can have multiple values for
+                # one category
+                ds_table = ds_table.drop(columns=["orig_cat_name"])
 
-            # if we need to map entities pass this info to the conversion function
-            if "entity_mapping" in crf_spec[table]:
-                entity_mapping = crf_spec[table]["entity_mapping"]
-            else:
-                entity_mapping = None
-            if submission_type == "CRF":
-                meta_data_input = {
-                    "title": f"CRF data submitted in {submission_year} to the UNFCCC "
-                    f"in the {type_name} ({submission_type}) by {country_name}. "
-                    f"Submission date: {date_or_version}"
-                }
-            else:
-                meta_data_input = {
-                    "title": f"Data submitted for round {submission_year} "
-                    f"to the UNFCCC in the {type_name} ({submission_type}) by "
-                    f"{country_name}. Submission version: {date_or_version}"
-                }
-            ds_table_if = convert_crf_table_to_pm2if(
-                ds_table,
-                submission_year,
-                meta_data_input=meta_data_input,
-                entity_mapping=entity_mapping,
-                submission_type=submission_type,
-            )
-
-            # skip empty tables
-            if (
-                not ds_table_if.set_index(ds_table_if.attrs["dimensions"]["*"])
-                .isna()
-                .all(axis=None)
-            ):
-                # now convert to native PRIMAP2 format
-                ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
-
-                # if individual data for emissions and removals / recovery exist combine
-                # them
-                if (
-                    ("CO2 removals" in ds_table_pm2.data_vars)
-                    and ("CO2 emissions" in ds_table_pm2.data_vars)
-                    and "CO2" not in ds_table_pm2.data_vars
-                ):
-                    # we can just sum to CO2 as we made sure that it doesn't exist.
-                    # If we have CO2 and removals but not emissions, CO2 already has
-                    # removals subtracted and we do nothing here
-                    ds_table_pm2["CO2"] = ds_table_pm2[
-                        ["CO2 emissions", "CO2 removals"]
-                    ].pr.sum(dim="entity", skipna=True, min_count=1)
-                    ds_table_pm2["CO2"].attrs["entity"] = "CO2"
+                # if we need to map entities pass this info to the conversion function
+                if "entity_mapping" in crf_spec[table]:
+                    entity_mapping = crf_spec[table]["entity_mapping"]
+                else:
+                    entity_mapping = None
+                if submission_type == "CRF":
+                    meta_data_input = {
+                        "title": f"CRF data submitted in {submission_year} to the "
+                        f"UNFCCC in the {type_name} ({submission_type}) by "
+                        f"{country_name}. "
+                        f"Submission date: {date_or_version}"
+                    }
+                else:
+                    meta_data_input = {
+                        "title": f"Data submitted for round {submission_year} "
+                        f"to the UNFCCC in the {type_name} ({submission_type}) by "
+                        f"{country_name}. Submission version: {date_or_version}"
+                    }
+
+                if "decimal_sep" in crf_spec[table]["table"]:
+                    decimal_sep = crf_spec[table]["table"]["decimal_sep"]
+                else:
+                    decimal_sep = "."
+                if "thousands_sep" in crf_spec[table]["table"]:
+                    thousands_sep = crf_spec[table]["table"]["thousands_sep"]
+                else:
+                    thousands_sep = ","
+
+                ds_table_if = convert_crf_table_to_pm2if(
+                    ds_table,
+                    submission_year,
+                    meta_data_input=meta_data_input,
+                    entity_mapping=entity_mapping,
+                    submission_type=submission_type,
+                    decimal_sep=decimal_sep,
+                    thousands_sep=thousands_sep,
+                )
 
+                # skip empty tables
                 if (
-                    ("CH4 removals" in ds_table_pm2.data_vars)
-                    and ("CH4 emissions" in ds_table_pm2.data_vars)
-                    and "CH4" not in ds_table_pm2.data_vars
+                    not ds_table_if.set_index(ds_table_if.attrs["dimensions"]["*"])
+                    .isna()
+                    .all(axis=None)
                 ):
-                    # we can just sum to CH4 as we made sure that it doesn't exist.
-                    # If we have CH4 and removals but not emissions, CH4 already has
-                    # removals subtracted and we do nothing here
-                    ds_table_pm2["CH4"] = ds_table_pm2[
-                        ["CH4 emissions", "CH4 removals"]
-                    ].pr.sum(dim="entity", skipna=True, min_count=1)
-                    ds_table_pm2["CH4"].attrs["entity"] = "CH4"
-
-                # combine per table DS
-                if ds_all is None:
-                    ds_all = ds_table_pm2
+                    # now convert to native PRIMAP2 format
+                    ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)
+
+                    # if individual data for emissions and removals / recovery exist
+                    # combine them
+                    if (
+                        ("CO2 removals" in ds_table_pm2.data_vars)
+                        and ("CO2 emissions" in ds_table_pm2.data_vars)
+                        and "CO2" not in ds_table_pm2.data_vars
+                    ):
+                        # we can just sum to CO2 as we made sure that it doesn't exist.
+                        # If we have CO2 and removals but not emissions, CO2 already has
+                        # removals subtracted and we do nothing here
+                        ds_table_pm2["CO2"] = ds_table_pm2[
+                            ["CO2 emissions", "CO2 removals"]
+                        ].pr.sum(dim="entity", skipna=True, min_count=1)
+                        ds_table_pm2["CO2"].attrs["entity"] = "CO2"
+
+                    if (
+                        ("CH4 removals" in ds_table_pm2.data_vars)
+                        and ("CH4 emissions" in ds_table_pm2.data_vars)
+                        and "CH4" not in ds_table_pm2.data_vars
+                    ):
+                        # we can just sum to CH4 as we made sure that it doesn't exist.
+                        # If we have CH4 and removals but not emissions, CH4 already has
+                        # removals subtracted and we do nothing here
+                        ds_table_pm2["CH4"] = ds_table_pm2[
+                            ["CH4 emissions", "CH4 removals"]
+                        ].pr.sum(dim="entity", skipna=True, min_count=1)
+                        ds_table_pm2["CH4"].attrs["entity"] = "CH4"
+
+                    # combine per table DS
+                    if ds_all is None:
+                        ds_all = ds_table_pm2
+                    else:
+                        ds_all = ds_all.combine_first(ds_table_pm2)
                 else:
-                    ds_all = ds_all.combine_first(ds_table_pm2)
+                    # log that table is empty
+                    empty_tables.append([table, country_code, ""])
+            elif not_present:
+                # log that table is not present
+                missing_worksheets.append([table, country_code, ""])
             else:
-                # log that table is empty
-                empty_tables.append(table)
+                print(
+                    f"Empty DataFrame returned for table {table}, "
+                    f"country {country_code}. Check log for errors."
+                )
 
         # check if there were log messages.
         save_data = True
@@ -296,6 +324,19 @@ def read_crf_for_country(  # noqa: PLR0912, PLR0915
             )
             save_empty_tables_info(empty_tables, log_location)
 
+        if len(missing_worksheets) > 0:
+            today = date.today()
+            log_location = (
+                log_path
+                / f"{submission_type}{submission_year}"
+                / f"{country_code}_missing_tables_{today.strftime('%Y-%m-%d')}.csv"
+            )
+            print(
+                f"Missing worksheets for {country_code}: "
+                f"{empty_tables}. Save log to {log_location}"
+            )
+            save_empty_tables_info(missing_worksheets, log_location)
+
         if save_data:
             compression = dict(zlib=True, complevel=9)
             output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
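Missing worksheets are now logged to their own CSV next to the empty-table log, reusing save_empty_tables_info. The implementation of that helper is not part of this diff, so the sketch below only assumes a plain CSV writer for the [table, country, year] rows collected above; the function name here is hypothetical.

# Hedged sketch of a CSV writer for the [table, country, year] rows collected
# above; the real save_empty_tables_info is not shown in this diff, so this is
# only an assumption about its shape, not the actual implementation.
import csv
from pathlib import Path


def save_tables_info(rows: list[list], log_location: Path) -> None:
    """Write one [table, country, year] row per missing or empty table."""
    with open(log_location, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["table", "country", "year"])
        writer.writerows(rows)


# example: rows as collected in read_crf_for_country (year is empty there)
# save_tables_info([["Table3.B(b)", "PRY", ""]], Path("PRY_missing_tables.csv"))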

Some files were not shown because too many files changed in this diff