|
@@ -2,124 +2,13 @@
|
|
|
|
|
|
import pandas as pd
|
|
|
import primap2 as pm2
|
|
|
-import pycountry
|
|
|
|
|
|
from src.faostat_data_primap.helper.definitions import (
|
|
|
+ country_to_iso3_mapping,
|
|
|
downloaded_data_path,
|
|
|
read_config_all,
|
|
|
-country_to_iso3_mapping,
|
|
|
)
|
|
|
|
|
|
-# mapping = pd.read_csv("../../FAOSTAT_data_11-19-2024.csv")#, encoding="ISO-8859-1")
|
|
|
-# mapping_dict = {}
|
|
|
-# for idx, row in mapping.iterrows():
|
|
|
-# mapping_dict[row['Country']] = row['ISO3 Code']
|
|
|
-
|
|
|
-
|
|
|
-# custom_country_mapping_code = {}
|
|
|
-# custom_country_mapping_name = {
|
|
|
-# # farm gate agricultur energy
|
|
|
-# "Bolivia (Plurinational State of)": "BOL",
|
|
|
-# "China, Hong Kong SAR": "HKG",
|
|
|
-# "China, Macao SAR": "MAC",
|
|
|
-# "China, mainland": "CHN",
|
|
|
-# "China, Taiwan Province of": "TWN",
|
|
|
-# "Iran (Islamic Republic of)": "IRN",
|
|
|
-# "Czechoslovakia": "CSK",
|
|
|
-# "Ethiopia PDR": "ETH",
|
|
|
-# "Netherlands (Kingdom of the)": "NLD",
|
|
|
-# "Netherlands Antilles (former)": "ANT",
|
|
|
-# # todo is former Sudan same as the new (north) Sudan
|
|
|
-# "Sudan (former)": "SDN",
|
|
|
-# "USSR": "SUN",
|
|
|
-# "Venezuela (Bolivarian Republic of)": "VEN",
|
|
|
-# "Yugoslav SFR": "YUG",
|
|
|
-# "World": "EARTH",
|
|
|
-# # todo Andrews cement list below (deleted commented lines)
|
|
|
-# # "Bonaire, Saint Eustatius and Saba": "BES",
|
|
|
-# # "Cape Verde": "CPV",
|
|
|
-# "Democratic Republic of the Congo": "COD",
|
|
|
-# # "Faeroe Islands": "FRO",
|
|
|
-# "Micronesia (Federated States of)": "FSM",
|
|
|
-# # "Iran": "IRN",
|
|
|
-# # "Laos": "LAO",
|
|
|
-# # "Occupied Palestinian Territory": "PSE",
|
|
|
-# # "Swaziland": "SWZ",
|
|
|
-# # "Taiwan": "TWN",
|
|
|
-# "Wallis and Futuna Islands": "WLF",
|
|
|
-# # farm gate emissions crops
|
|
|
-# "United States Virgin Islands": "VIR",
|
|
|
-# # todo is this relevant to us?
|
|
|
-# "Pacific Islands Trust Territory": "PIC",
|
|
|
-# "Svalbard and Jan Mayen Islands": "SJM", # Norwy
|
|
|
-# # something goes wrong with french characters in land_use_forest
|
|
|
-# "Côte d'Ivoire": "CIV",
|
|
|
-# "Curaçao": "CUW",
|
|
|
-# "Réunion": "REU",
|
|
|
-# "Türkiye": "TUR",
|
|
|
-# # pycountry mixes up these
|
|
|
-# 'Niger' : 'NER',
|
|
|
-# 'Nigeria' : 'NGA',
|
|
|
-# "Curaçao" : "CUW",
|
|
|
-# "Republic of Korea" : 'KOR',
|
|
|
-# "Democratic People's Republic of Korea" : "PRK",
|
|
|
-# }
|
|
|
-
|
|
|
-#
|
|
|
-# def get_country_code(
|
|
|
-# country_name: str,
|
|
|
-# ) -> str:
|
|
|
-# """
|
|
|
-# Get country code for country name.
|
|
|
-#
|
|
|
-# If the input is a code it will be returned,
|
|
|
-# if the input is not a three-letter code a search will be performed
|
|
|
-#
|
|
|
-# Parameters
|
|
|
-# ----------
|
|
|
-# country_name: str
|
|
|
-# Country code or name to get the three-letter code for.
|
|
|
-#
|
|
|
-# Returns
|
|
|
-# -------
|
|
|
-# country_code: str
|
|
|
-#
|
|
|
-# """
|
|
|
-# # First check if it's in the list of custom codes
|
|
|
-# if country_name in custom_country_mapping_code:
|
|
|
-# country_code = country_name
|
|
|
-# elif country_name in custom_country_mapping_name:
|
|
|
-# country_code = custom_country_mapping_name[country_name]
|
|
|
-# else:
|
|
|
-# try:
|
|
|
-# # check if it's a 3 letter UNFCCC_GHG_data
|
|
|
-# country = pycountry.countries.get(alpha_3=country_name)
|
|
|
-# country_code = country.alpha_3
|
|
|
-# except:
|
|
|
-# try:
|
|
|
-# country = pycountry.countries.search_fuzzy(
|
|
|
-# country_name.replace("_", " ")
|
|
|
-# )
|
|
|
-# except:
|
|
|
-# msg = f"Cannot map country {country_name} to country code."
|
|
|
-# raise ValueError(msg)
|
|
|
-# if len(country) > 1:
|
|
|
-# country_code = None
|
|
|
-# for current_country in country:
|
|
|
-# if current_country.name == country_name:
|
|
|
-# country_code = current_country.alpha_3
|
|
|
-# if country_code is None:
|
|
|
-# msg = (
|
|
|
-# f"Country name {country_name} has {len(country)} "
|
|
|
-# "possible results for country codes."
|
|
|
-# )
|
|
|
-# raise ValueError(msg)
|
|
|
-#
|
|
|
-# country_code = country[0].alpha_3
|
|
|
-#
|
|
|
-# return country_code
|
|
|
-
|
|
|
-
|
|
|
files_to_read = (
|
|
|
(
|
|
|
"farm_gate_agriculture_energy",
|
|
@@ -160,7 +49,7 @@ for domain, release in reversed(files_to_read):
|
|
|
print(f"Read {read_config["filename"]}")
|
|
|
dataset_path = downloaded_data_path / domain / release / read_config["filename"]
|
|
|
# There are some non-utf8 characters in Emissions_Drained_Organic_Soils_E_All_Data_NOFLAG.csv
|
|
|
- df_domain = pd.read_csv(dataset_path, encoding='ISO-8859-1')
|
|
|
+ df_domain = pd.read_csv(dataset_path, encoding="ISO-8859-1")
|
|
|
|
|
|
# remove rows by unit
|
|
|
# todo this is maybe not a good idea as it hides the elements to be removed
|
|
@@ -177,43 +66,13 @@ for domain, release in reversed(files_to_read):
|
|
|
if "areas_to_remove" in read_config.keys():
|
|
|
df_domain = df_domain[~df_domain["Area"].isin(read_config["areas_to_remove"])]
|
|
|
|
|
|
- # check for duplicates (same data, different country name)
|
|
|
- # duplicates = df_domain.copy().drop(labels=["Area", "Area Code (M49)", "Area Code"], axis=1)
|
|
|
- # duplicates = duplicates[duplicates.duplicated(keep=False)]
|
|
|
- # if not duplicates.empty:
|
|
|
- # msg = f"Duplicate values for {domain}"
|
|
|
- # raise ValueError(msg)
|
|
|
-
|
|
|
- # country name to ISO3 country code mapping
|
|
|
- # countries_to_map = [
|
|
|
- # c for c in df_domain["Area"].unique() if c not in country_mapping.keys()
|
|
|
- # ]
|
|
|
- # for country_to_map in countries_to_map:
|
|
|
- # country_mapping[country_to_map] = get_country_code(country_to_map)
|
|
|
-
|
|
|
- # make sure we don't map duplicate country codes
|
|
|
- # if len(country_mapping.values()) != len(set(country_mapping.values())):
|
|
|
- # duplicate_codes = [x for i, x in enumerate(list(country_mapping.values())) if list(country_mapping.values()).count(x) > 1]
|
|
|
- # duplicates = [(key, value) for (key, value) in country_mapping.items() if value in duplicate_codes]
|
|
|
- # msg = f"Duplicate country codes for {domain}. Check country_mapping"
|
|
|
- # raise ValueError(msg)
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
# create country columns
|
|
|
df_domain["country (ISO3)"] = df_domain["Area"].map(country_to_iso3_mapping)
|
|
|
|
|
|
# check all countries are converted into iso3 codes
|
|
|
- if any(df_domain['country (ISO3)'].isna()):
|
|
|
+ if any(df_domain["country (ISO3)"].isna()):
|
|
|
raise ValueError
|
|
|
|
|
|
- # check for duplicates (same data, different country name)
|
|
|
- # duplicates = df_domain.copy().drop(labels=["Area", "Area Code (M49)", "Area Code"], axis=1)
|
|
|
- # duplicates = duplicates[duplicates.duplicated(keep=False)]
|
|
|
- # if not duplicates.empty:
|
|
|
- # msg = f"Duplicate values for {domain}. Check country {duplicates['country (ISO3)'].unique()}"
|
|
|
- # raise ValueError(msg)
|
|
|
-
|
|
|
# create entity column
|
|
|
df_domain["entity"] = df_domain["Element"].map(read_config["entity_mapping"])
|
|
|
|
|
@@ -244,14 +103,17 @@ for domain, release in reversed(files_to_read):
|
|
|
join="outer",
|
|
|
).reset_index(drop=True)
|
|
|
|
|
|
+# df_all = df_all.drop(labels=["Source"], axis=1)
|
|
|
+df_all["Source"] = df_all["Source"].fillna("unknown")
|
|
|
coords_cols = {
|
|
|
"area": "country (ISO3)",
|
|
|
"unit": "Unit",
|
|
|
"entity": "entity",
|
|
|
"source": "Source",
|
|
|
+ "category": "category",
|
|
|
}
|
|
|
|
|
|
-coords_terminologies = {"area": "ISO3", "category": "FAOSTAT"}
|
|
|
+coords_terminologies = {"area": "ISO3", "category": "FAOSTAT", "scenario": "FAO"}
|
|
|
|
|
|
coords_defaults = {
|
|
|
# "source": "FAO",
|
|
@@ -262,15 +124,21 @@ coords_value_mapping = {}
|
|
|
filter_keep = {}
|
|
|
filter_remove = {}
|
|
|
meta_data = {
|
|
|
- "references": "tbd",
|
|
|
- "rights": "tbd",
|
|
|
- "contact": "tbd",
|
|
|
- "title": "tbd",
|
|
|
- "comment": "tbd",
|
|
|
- "institution": "tbd",
|
|
|
+ "references": "https://www.fao.org/faostat",
|
|
|
+ "rights": "Creative Commons Attribution-4.0 International licence (CC BY 4.0)",
|
|
|
+ "contact": "daniel.busch@climate-resource.com",
|
|
|
+ "title": "Agrifood systems emissions",
|
|
|
+ "comment": (
|
|
|
+ "Published by Food and Agriculture Organization of the "
|
|
|
+ "United Nations (FAO), converted to PRIMAP2 format by "
|
|
|
+ "Daniel Busch"
|
|
|
+ ),
|
|
|
+ "institution": ("Food and Agriculture Organization of the United Nations"),
|
|
|
}
|
|
|
# Rename columns to remove the "Y" prefix
|
|
|
-df_all.rename(columns=lambda x: x.lstrip('Y') if x.startswith('Y') else x, inplace=True)
|
|
|
+df_all = df_all.rename(columns=lambda x: x.lstrip("Y") if x.startswith("Y") else x)
|
|
|
+df_all[df_all["entity"].isin(["FGASES (AR5GWP100)", "KYOTOGHG (AR5GWP100)"])]["unit"]
|
|
|
+
|
|
|
|
|
|
data_if = pm2.pm2io.convert_wide_dataframe_if(
|
|
|
df_all,
|
|
@@ -287,11 +155,4 @@ data_if = pm2.pm2io.convert_wide_dataframe_if(
|
|
|
data_pm2 = pm2.pm2io.from_interchange_format(data_if, data_if.attrs)
|
|
|
|
|
|
# convert back to IF for standardized units
|
|
|
-data_if = data_pm2.pr.to_interchange_format()
|
|
|
-
|
|
|
-pass
|
|
|
-# steps:
|
|
|
-# convert to primap2 format
|
|
|
-# save raw data set
|
|
|
-# convert categories to IPCC2006_PRIMAP standard
|
|
|
-# save data set
|
|
|
+data_if = data_pm2.pr.to_interchange_format()
|