Преглед изворни кода

all data in IF (interchange format), pm2 format still fails [skip ci]

Daniel Busch пре 3 месеци
родитељ
комит
76f9e52bd7
2 измењених фајлова са 22 додато и 161 уклоњено
  1. 1 1
      src/faostat_data_primap/helper/definitions.py
  2. 21 160
      src/faostat_data_primap/read.py

+ 1 - 1
src/faostat_data_primap/helper/definitions.py

@@ -717,7 +717,7 @@ country_to_iso3_mapping = {
     "Yugoslav SFR": "F248",
     "Zambia": "ZMB",
     "Zimbabwe": "ZWE",
-    # reading the csv correctly doesn't work for some domains
+    # reading the special characters (é, ô, ü etc.) fails for some domains
     # todo there is probably a better way to solve this
     "Côte d'Ivoire" : "CIV",
     "Curaçao" : "CUW",

+ 21 - 160
src/faostat_data_primap/read.py

@@ -2,124 +2,13 @@
 
 import pandas as pd
 import primap2 as pm2
-import pycountry
 
 from src.faostat_data_primap.helper.definitions import (
+    country_to_iso3_mapping,
     downloaded_data_path,
     read_config_all,
-country_to_iso3_mapping,
 )
 
-# mapping = pd.read_csv("../../FAOSTAT_data_11-19-2024.csv")#, encoding="ISO-8859-1")
-# mapping_dict = {}
-# for idx, row in mapping.iterrows():
-#     mapping_dict[row['Country']] = row['ISO3 Code']
-
-
-# custom_country_mapping_code = {}
-# custom_country_mapping_name = {
-#     # farm gate agriculture energy
-#     "Bolivia (Plurinational State of)": "BOL",
-#     "China, Hong Kong SAR": "HKG",
-#     "China, Macao SAR": "MAC",
-#     "China, mainland": "CHN",
-#     "China, Taiwan Province of": "TWN",
-#     "Iran (Islamic Republic of)": "IRN",
-#     "Czechoslovakia": "CSK",
-#     "Ethiopia PDR": "ETH",
-#     "Netherlands (Kingdom of the)": "NLD",
-#     "Netherlands Antilles (former)": "ANT",
-#     # todo is former Sudan same as the new (north) Sudan
-#     "Sudan (former)": "SDN",
-#     "USSR": "SUN",
-#     "Venezuela (Bolivarian Republic of)": "VEN",
-#     "Yugoslav SFR": "YUG",
-#     "World": "EARTH",
-#     # todo Andrews cement list below (deleted commented lines)
-#     # "Bonaire, Saint Eustatius and Saba": "BES",
-#     # "Cape Verde": "CPV",
-#     "Democratic Republic of the Congo": "COD",
-#     # "Faeroe Islands": "FRO",
-#     "Micronesia (Federated States of)": "FSM",
-#     # "Iran": "IRN",
-#     # "Laos": "LAO",
-#     # "Occupied Palestinian Territory": "PSE",
-#     # "Swaziland": "SWZ",
-#     # "Taiwan": "TWN",
-#     "Wallis and Futuna Islands": "WLF",
-#     # farm gate emissions crops
-#     "United States Virgin Islands": "VIR",
-#     # todo is this relevant to us?
-#     "Pacific Islands Trust Territory": "PIC",
-#     "Svalbard and Jan Mayen Islands": "SJM",  # Norway
-#     # something goes wrong with french characters in land_use_forest
-#     "Côte d'Ivoire": "CIV",
-#     "Curaçao": "CUW",
-#     "Réunion": "REU",
-#     "Türkiye": "TUR",
-#     # pycountry mixes up these
-#     'Niger' : 'NER',
-#     'Nigeria' : 'NGA',
-#     "Curaçao" : "CUW",
-#     "Republic of Korea" : 'KOR',
-#     "Democratic People's Republic of Korea" : "PRK",
-# }
-
-#
-# def get_country_code(
-#     country_name: str,
-# ) -> str:
-#     """
-#     Get country code for country name.
-#
-#     If the input is a code it will be returned,
-#     if the input is not a three-letter code a search will be performed
-#
-#     Parameters
-#     ----------
-#     country_name: str
-#         Country code or name to get the three-letter code for.
-#
-#     Returns
-#     -------
-#         country_code: str
-#
-#     """
-#     # First check if it's in the list of custom codes
-#     if country_name in custom_country_mapping_code:
-#         country_code = country_name
-#     elif country_name in custom_country_mapping_name:
-#         country_code = custom_country_mapping_name[country_name]
-#     else:
-#         try:
-#             # check if it's a 3 letter UNFCCC_GHG_data
-#             country = pycountry.countries.get(alpha_3=country_name)
-#             country_code = country.alpha_3
-#         except:
-#             try:
-#                 country = pycountry.countries.search_fuzzy(
-#                     country_name.replace("_", " ")
-#                 )
-#             except:
-#                 msg = f"Cannot map country {country_name} to country code."
-#                 raise ValueError(msg)
-#             if len(country) > 1:
-#                 country_code = None
-#                 for current_country in country:
-#                     if current_country.name == country_name:
-#                         country_code = current_country.alpha_3
-#                 if country_code is None:
-#                     msg = (
-#                         f"Country name {country_name} has {len(country)} "
-#                         "possible results for country codes."
-#                     )
-#                     raise ValueError(msg)
-#
-#             country_code = country[0].alpha_3
-#
-#     return country_code
-
-
 files_to_read = (
     (
         "farm_gate_agriculture_energy",
@@ -160,7 +49,7 @@ for domain, release in reversed(files_to_read):
     print(f"Read {read_config["filename"]}")
     dataset_path = downloaded_data_path / domain / release / read_config["filename"]
     # There are some non-utf8 characters in Emissions_Drained_Organic_Soils_E_All_Data_NOFLAG.csv
-    df_domain = pd.read_csv(dataset_path, encoding='ISO-8859-1')
+    df_domain = pd.read_csv(dataset_path, encoding="ISO-8859-1")
 
     # remove rows by unit
     # todo this is maybe not a good idea as it hides the elements to be removed
@@ -177,43 +66,13 @@ for domain, release in reversed(files_to_read):
     if "areas_to_remove" in read_config.keys():
         df_domain = df_domain[~df_domain["Area"].isin(read_config["areas_to_remove"])]
 
-    # check for duplicates (same data, different country name)
-    # duplicates = df_domain.copy().drop(labels=["Area", "Area Code (M49)", "Area Code"], axis=1)
-    # duplicates = duplicates[duplicates.duplicated(keep=False)]
-    # if not duplicates.empty:
-    #     msg = f"Duplicate values for {domain}"
-    #     raise ValueError(msg)
-
-    # country name to ISO3 country code mapping
-    # countries_to_map = [
-    #     c for c in df_domain["Area"].unique() if c not in country_mapping.keys()
-    # ]
-    # for country_to_map in countries_to_map:
-    #     country_mapping[country_to_map] = get_country_code(country_to_map)
-
-    # make sure we don't map duplicate country codes
-    # if len(country_mapping.values()) != len(set(country_mapping.values())):
-    #     duplicate_codes = [x for i, x in enumerate(list(country_mapping.values())) if list(country_mapping.values()).count(x) > 1]
-    #     duplicates = [(key, value) for (key, value) in country_mapping.items() if value in duplicate_codes]
-    #     msg = f"Duplicate country codes for {domain}. Check country_mapping"
-    #     raise ValueError(msg)
-
-
-
     # create country columns
     df_domain["country (ISO3)"] = df_domain["Area"].map(country_to_iso3_mapping)
 
     # check all countries are converted into iso3 codes
-    if any(df_domain['country (ISO3)'].isna()):
+    if any(df_domain["country (ISO3)"].isna()):
         raise ValueError
 
-    # check for duplicates (same data, different country name)
-    # duplicates = df_domain.copy().drop(labels=["Area", "Area Code (M49)", "Area Code"], axis=1)
-    # duplicates = duplicates[duplicates.duplicated(keep=False)]
-    # if not duplicates.empty:
-    #     msg = f"Duplicate values for {domain}. Check country {duplicates['country (ISO3)'].unique()}"
-    #     raise ValueError(msg)
-
     # create entity column
     df_domain["entity"] = df_domain["Element"].map(read_config["entity_mapping"])
 
@@ -244,14 +103,17 @@ for domain, release in reversed(files_to_read):
             join="outer",
         ).reset_index(drop=True)
 
+# df_all = df_all.drop(labels=["Source"], axis=1)
+df_all["Source"] = df_all["Source"].fillna("unknown")
 coords_cols = {
     "area": "country (ISO3)",
     "unit": "Unit",
     "entity": "entity",
     "source": "Source",
+    "category" : "category"
 }
 
-coords_terminologies = {"area": "ISO3", "category": "FAOSTAT"}
+coords_terminologies = {"area": "ISO3", "category": "FAOSTAT", "scenario": "FAO"}
 
 coords_defaults = {
     # "source": "FAO",
@@ -262,15 +124,21 @@ coords_value_mapping = {}
 filter_keep = {}
 filter_remove = {}
 meta_data = {
-    "references": "tbd",
-    "rights": "tbd",
-    "contact": "tbd",
-    "title": "tbd",
-    "comment": "tbd",
-    "institution": "tbd",
+    "references": "https://www.fao.org/faostat",
+    "rights": "Creative Commons Attribution-4.0 International licence (CC BY 4.0)",
+    "contact": "daniel.busch@climate-resource.com",
+    "title": "Agrifood systems emissions",
+    "comment": (
+        "Published by Food and Agriculture Organization of the "
+        "United Nations (FAO), converted to PRIMAP2 format by "
+        "Daniel Busch"
+    ),
+    "institution": ("Food and Agriculture Organization of the United Nations"),
 }
 # Rename columns to remove the "Y" prefix
-df_all.rename(columns=lambda x: x.lstrip('Y') if x.startswith('Y') else x, inplace=True)
+df_all = df_all.rename(columns=lambda x: x.lstrip("Y") if x.startswith("Y") else x)
+df_all[df_all["entity"].isin(['FGASES (AR5GWP100)', 'KYOTOGHG (AR5GWP100)'])]["unit"]
+
 
 data_if = pm2.pm2io.convert_wide_dataframe_if(
     df_all,
@@ -287,11 +155,4 @@ data_if = pm2.pm2io.convert_wide_dataframe_if(
 data_pm2 = pm2.pm2io.from_interchange_format(data_if, data_if.attrs)
 
 # convert back to IF for standardized units
-data_if = data_pm2.pr.to_interchange_format()
-
-pass
-# steps:
-# convert to primap2 format
-# save raw data set
-# convert categories to IPCC2006_PRIMAP standard
-# save data set
+data_if = data_pm2.pr.to_interchange_format()