Преглед изворни кода

all data in IF (interchange format), pm2 format still fails [skip ci]

Daniel Busch пре 3 месеци
родитељ
комит
76f9e52bd7
2 измењених фајлова са 22 додато и 161 уклоњено
  1. 1 1
      src/faostat_data_primap/helper/definitions.py
  2. 21 160
      src/faostat_data_primap/read.py

+ 1 - 1
src/faostat_data_primap/helper/definitions.py

@@ -717,7 +717,7 @@ country_to_iso3_mapping = {
     "Yugoslav SFR": "F248",
     "Zambia": "ZMB",
     "Zimbabwe": "ZWE",
-    # reading the csv correctly doesn't work for some domains
+    # reading the special characters (é, ô, ü etc.) fails for some domains
     # todo there is probably a better way to solve this
     "Côte d'Ivoire" : "CIV",
     "Curaçao" : "CUW",

+ 21 - 160
src/faostat_data_primap/read.py

@@ -2,124 +2,13 @@
 
 import pandas as pd
 import primap2 as pm2
-import pycountry
 
 from src.faostat_data_primap.helper.definitions import (
+    country_to_iso3_mapping,
     downloaded_data_path,
     read_config_all,
-country_to_iso3_mapping,
 )
 
-# mapping = pd.read_csv("../../FAOSTAT_data_11-19-2024.csv")#, encoding="ISO-8859-1")
-# mapping_dict = {}
-# for idx, row in mapping.iterrows():
-#     mapping_dict[row['Country']] = row['ISO3 Code']
-
-
-# custom_country_mapping_code = {}
-# custom_country_mapping_name = {
-#     # farm gate agriculture energy
-#     "Bolivia (Plurinational State of)": "BOL",
-#     "China, Hong Kong SAR": "HKG",
-#     "China, Macao SAR": "MAC",
-#     "China, mainland": "CHN",
-#     "China, Taiwan Province of": "TWN",
-#     "Iran (Islamic Republic of)": "IRN",
-#     "Czechoslovakia": "CSK",
-#     "Ethiopia PDR": "ETH",
-#     "Netherlands (Kingdom of the)": "NLD",
-#     "Netherlands Antilles (former)": "ANT",
-#     # todo is former Sudan same as the new (north) Sudan
-#     "Sudan (former)": "SDN",
-#     "USSR": "SUN",
-#     "Venezuela (Bolivarian Republic of)": "VEN",
-#     "Yugoslav SFR": "YUG",
-#     "World": "EARTH",
-#     # todo Andrews cement list below (deleted commented lines)
-#     # "Bonaire, Saint Eustatius and Saba": "BES",
-#     # "Cape Verde": "CPV",
-#     "Democratic Republic of the Congo": "COD",
-#     # "Faeroe Islands": "FRO",
-#     "Micronesia (Federated States of)": "FSM",
-#     # "Iran": "IRN",
-#     # "Laos": "LAO",
-#     # "Occupied Palestinian Territory": "PSE",
-#     # "Swaziland": "SWZ",
-#     # "Taiwan": "TWN",
-#     "Wallis and Futuna Islands": "WLF",
-#     # farm gate emissions crops
-#     "United States Virgin Islands": "VIR",
-#     # todo is this relevant to us?
-#     "Pacific Islands Trust Territory": "PIC",
-#     "Svalbard and Jan Mayen Islands": "SJM",  # Norway
-#     # something goes wrong with french characters in land_use_forest
-#     "Côte d'Ivoire": "CIV",
-#     "Curaçao": "CUW",
-#     "Réunion": "REU",
-#     "Türkiye": "TUR",
-#     # pycountry mixes up these
-#     'Niger' : 'NER',
-#     'Nigeria' : 'NGA',
-#     "Curaçao" : "CUW",
-#     "Republic of Korea" : 'KOR',
-#     "Democratic People's Republic of Korea" : "PRK",
-# }
-
-#
-# def get_country_code(
-#     country_name: str,
-# ) -> str:
-#     """
-#     Get country code for country name.
-#
-#     If the input is a code it will be returned,
-#     if the input is not a three-letter code a search will be performed
-#
-#     Parameters
-#     ----------
-#     country_name: str
-#         Country code or name to get the three-letter code for.
-#
-#     Returns
-#     -------
-#         country_code: str
-#
-#     """
-#     # First check if it's in the list of custom codes
-#     if country_name in custom_country_mapping_code:
-#         country_code = country_name
-#     elif country_name in custom_country_mapping_name:
-#         country_code = custom_country_mapping_name[country_name]
-#     else:
-#         try:
-#             # check if it's a 3 letter UNFCCC_GHG_data
-#             country = pycountry.countries.get(alpha_3=country_name)
-#             country_code = country.alpha_3
-#         except:
-#             try:
-#                 country = pycountry.countries.search_fuzzy(
-#                     country_name.replace("_", " ")
-#                 )
-#             except:
-#                 msg = f"Cannot map country {country_name} to country code."
-#                 raise ValueError(msg)
-#             if len(country) > 1:
-#                 country_code = None
-#                 for current_country in country:
-#                     if current_country.name == country_name:
-#                         country_code = current_country.alpha_3
-#                 if country_code is None:
-#                     msg = (
-#                         f"Country name {country_name} has {len(country)} "
-#                         "possible results for country codes."
-#                     )
-#                     raise ValueError(msg)
-#
-#             country_code = country[0].alpha_3
-#
-#     return country_code
-
-
 files_to_read = (
     (
         "farm_gate_agriculture_energy",
@@ -160,7 +49,7 @@ for domain, release in reversed(files_to_read):
     print(f"Read {read_config["filename"]}")
     dataset_path = downloaded_data_path / domain / release / read_config["filename"]
     # There are some non-utf8 characters in Emissions_Drained_Organic_Soils_E_All_Data_NOFLAG.csv
-    df_domain = pd.read_csv(dataset_path, encoding='ISO-8859-1')
+    df_domain = pd.read_csv(dataset_path, encoding="ISO-8859-1")
 
     # remove rows by unit
     # todo this is maybe not a good idea as it hides the elements to be removed
@@ -177,43 +66,13 @@ for domain, release in reversed(files_to_read):
     if "areas_to_remove" in read_config.keys():
         df_domain = df_domain[~df_domain["Area"].isin(read_config["areas_to_remove"])]
 
-    # check for duplicates (same data, different country name)
-    # duplicates = df_domain.copy().drop(labels=["Area", "Area Code (M49)", "Area Code"], axis=1)
-    # duplicates = duplicates[duplicates.duplicated(keep=False)]
-    # if not duplicates.empty:
-    #     msg = f"Duplicate values for {domain}"
-    #     raise ValueError(msg)
-
-    # country name to ISO3 country code mapping
-    # countries_to_map = [
-    #     c for c in df_domain["Area"].unique() if c not in country_mapping.keys()
-    # ]
-    # for country_to_map in countries_to_map:
-    #     country_mapping[country_to_map] = get_country_code(country_to_map)
-
-    # make sure we don't map duplicate country codes
-    # if len(country_mapping.values()) != len(set(country_mapping.values())):
-    #     duplicate_codes = [x for i, x in enumerate(list(country_mapping.values())) if list(country_mapping.values()).count(x) > 1]
-    #     duplicates = [(key, value) for (key, value) in country_mapping.items() if value in duplicate_codes]
-    #     msg = f"Duplicate country codes for {domain}. Check country_mapping"
-    #     raise ValueError(msg)
-
-
-
     # create country columns
     df_domain["country (ISO3)"] = df_domain["Area"].map(country_to_iso3_mapping)
 
     # check all countries are converted into iso3 codes
-    if any(df_domain['country (ISO3)'].isna()):
+    if any(df_domain["country (ISO3)"].isna()):
         raise ValueError
 
-    # check for duplicates (same data, different country name)
-    # duplicates = df_domain.copy().drop(labels=["Area", "Area Code (M49)", "Area Code"], axis=1)
-    # duplicates = duplicates[duplicates.duplicated(keep=False)]
-    # if not duplicates.empty:
-    #     msg = f"Duplicate values for {domain}. Check country {duplicates['country (ISO3)'].unique()}"
-    #     raise ValueError(msg)
-
     # create entity column
     df_domain["entity"] = df_domain["Element"].map(read_config["entity_mapping"])
 
@@ -244,14 +103,17 @@ for domain, release in reversed(files_to_read):
             join="outer",
         ).reset_index(drop=True)
 
+# df_all = df_all.drop(labels=["Source"], axis=1)
+df_all["Source"] = df_all["Source"].fillna("unknown")
 coords_cols = {
     "area": "country (ISO3)",
     "unit": "Unit",
     "entity": "entity",
     "source": "Source",
+    "category" : "category"
 }
 
-coords_terminologies = {"area": "ISO3", "category": "FAOSTAT"}
+coords_terminologies = {"area": "ISO3", "category": "FAOSTAT", "scenario": "FAO"}
 
 coords_defaults = {
     # "source": "FAO",
@@ -262,15 +124,21 @@ coords_value_mapping = {}
 filter_keep = {}
 filter_remove = {}
 meta_data = {
-    "references": "tbd",
-    "rights": "tbd",
-    "contact": "tbd",
-    "title": "tbd",
-    "comment": "tbd",
-    "institution": "tbd",
+    "references": "https://www.fao.org/faostat",
+    "rights": "Creative Commons Attribution-4.0 International licence (CC BY 4.0)",
+    "contact": "daniel.busch@climate-resource.com",
+    "title": "Agrifood systems emissions",
+    "comment": (
+        "Published by Food and Agriculture Organization of the "
+        "United Nations (FAO), converted to PRIMAP2 format by "
+        "Daniel Busch"
+    ),
+    "institution": ("Food and Agriculture Organization of the United Nations"),
 }
 # Rename columns to remove the "Y" prefix
-df_all.rename(columns=lambda x: x.lstrip('Y') if x.startswith('Y') else x, inplace=True)
+df_all = df_all.rename(columns=lambda x: x.lstrip("Y") if x.startswith("Y") else x)
+df_all[df_all["entity"].isin(['FGASES (AR5GWP100)', 'KYOTOGHG (AR5GWP100)'])]["unit"]
+
 
 data_if = pm2.pm2io.convert_wide_dataframe_if(
     df_all,
@@ -287,11 +155,4 @@ data_if = pm2.pm2io.convert_wide_dataframe_if(
 data_pm2 = pm2.pm2io.from_interchange_format(data_if, data_if.attrs)
 
 # convert back to IF for standardized units
-data_if = data_pm2.pr.to_interchange_format()
-
-pass
-# steps:
-# convert to primap2 format
-# save raw data set
-# convert categories to IPCC2006_PRIMAP standard
-# save data set
+data_if = data_pm2.pr.to_interchange_format()