@@ -1,16 +1,18 @@
"""read data set"""

-
import pandas as pd
import primap2 as pm2
import pycountry

-from src.faostat_data_primap.helper.definitions import downloaded_data_path
+from src.faostat_data_primap.helper.definitions import (
+    downloaded_data_path,
+    read_config_all,
+)

custom_country_mapping_code = {}

custom_country_mapping_name = {
-    # FAO
+    # farm gate agriculture energy
"Bolivia (Plurinational State of)": "BOL",
|
|
|
"China, Hong Kong SAR": "HKG",
|
|
|
"China, Macao SAR": "MAC",
|
|
@@ -28,17 +30,19 @@ custom_country_mapping_name = {
|
|
|
"Yugoslav SFR": "YUG",
|
|
|
"World": "EARTH",
|
|
|
# Andrew cement (probably not needed)
|
|
|
- "Bonaire, Saint Eustatius and Saba": "BES",
|
|
|
- "Cape Verde": "CPV",
|
|
|
+ # "Bonaire, Saint Eustatius and Saba": "BES",
|
|
|
+ # "Cape Verde": "CPV",
|
|
|
"Democratic Republic of the Congo": "COD",
|
|
|
- "Faeroe Islands": "FRO",
|
|
|
+ # "Faeroe Islands": "FRO",
|
|
|
"Micronesia (Federated States of)": "FSM",
|
|
|
- "Iran": "IRN",
|
|
|
- "Laos": "LAO",
|
|
|
- "Occupied Palestinian Territory": "PSE",
|
|
|
- "Swaziland": "SWZ",
|
|
|
- "Taiwan": "TWN",
|
|
|
+ # "Iran": "IRN",
|
|
|
+ # "Laos": "LAO",
|
|
|
+ # "Occupied Palestinian Territory": "PSE",
|
|
|
+ # "Swaziland": "SWZ",
|
|
|
+ # "Taiwan": "TWN",
|
|
|
"Wallis and Futuna Islands": "WLF",
|
|
|
+ # farm gate emissions crops
|
|
|
+ "United States Virgin Islands": "VIR",
|
|
|
}
|
|
|
|
|
|
|
|
@@ -109,75 +113,67 @@ files_to_read = (
    ),
)

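+# collect the processed data for all domains in one data frame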
+df_all = None
for domain, release, filename in files_to_read:
    dataset_path = downloaded_data_path / domain / release / filename
-    data_pd = pd.read_csv(dataset_path)
-
-    # remove in entries with unit TJ
-    data_pd = data_pd[data_pd["Unit"] != "TJ"]
-
-    # remove the country aggegrates
-    areas_to_remove = [
-        "World",
-        "Africa",
-        "Eastern Africa",
-        "Middle Africa",
-        "Northern Africa",
-        "Southern Africa",
-        "Western Africa",
-        "Americas",
-        "Northern America",
-        "Central America",
-        "Caribbean",
-        "South America",
-        "Asia",
-        "Central Asia",
-        "Eastern Asia",
-        "Southern Asia",
-        "South-eastern Asia",
-        "Western Asia",
-        "Europe",
-        "Eastern Europe",
-        "Northern Europe",
-        "Southern Europe",
-        "Western Europe",
-        "Oceania",
-        "Australia and New Zealand",
-        "Melanesia",
-        "Micronesia",
-        "Polynesia",
-        "Least Developed Countries",
-        "Land Locked Developing Countries",
-        "Small Island Developing States",
-        "Low Income Food Deficit Countries",
-        "Net Food Importing Developing Countries",
-        "Annex I countries",
-        "Non-Annex I countries",
-        "OECD",
-    ]
-
-    data_pd = data_pd[~data_pd["Area"].isin(areas_to_remove)]
-    country_mapping = {c: get_country_code(c) for c in data_pd["Area"].unique()}
-
-    data_pd["country (ISO3)"] = data_pd["Area"].map(country_mapping)
-
-    entity_mapping = {
-        "Emissions (CO2)": "CO2",
-        "Emissions (CH4)": "CH4",
-        "Emissions (N2O)": "N2O",
-    }
-
-    data_pd["entity"] = data_pd["Element"].map(entity_mapping)
-
-    # todo can we do this in primap2 function?
-    data_pd = data_pd.drop(
-        ["Element", "Element Code", "Item Code", "Area Code (M49)", "Area Code"], axis=1
+    read_config = read_config_all[domain][release]
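+    # reading options for this domain and release; the optional keys
+    # units_to_remove, elements_to_remove and areas_to_remove filter rows,
+    # while entity_mapping and columns_to_drop are used unconditionally below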
+
+    df_domain = pd.read_csv(dataset_path)
+
+    # remove rows by unit
+    # todo align pattern with below
+    # df_domain = df_domain[df_domain["Unit"] != "TJ"]
+    if "units_to_remove" in read_config.keys():
+        df_domain = df_domain[~df_domain["Unit"].isin(read_config["units_to_remove"])]
+
+    # remove rows by element
+    if "elements_to_remove" in read_config.keys():
+        df_domain = df_domain[
+            ~df_domain["Element"].isin(read_config["elements_to_remove"])
+        ]
+
+    # remove rows by area
+    if "areas_to_remove" in read_config.keys():
+        df_domain = df_domain[~df_domain["Area"].isin(read_config["areas_to_remove"])]
+
+    # todo we shouldn't re-compute this every time
+    country_mapping = {c: get_country_code(c) for c in df_domain["Area"].unique()}
+
+    # create country column
+    df_domain["country (ISO3)"] = df_domain["Area"].map(country_mapping)
+
+    # create entity column
+    df_domain["entity"] = df_domain["Element"].map(read_config["entity_mapping"])
+
+    # create category column (combination of Item and Element works best)
+    df_domain["category"] = df_domain["Item"] + " " + df_domain["Element"]
+
+    # drop columns we don't need
+    df_domain = df_domain.drop(
+        read_config["columns_to_drop"],
+        axis=1,
    )

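+    # combine the data for all domains in one data frame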
+    if df_all is None:
+        df_all = df_domain
+    else:
+        # makes sure there are no duplicate category names
+        if any(
+            [
+                category in df_all["category"].unique()
+                for category in df_domain["category"].unique()
+            ]
+        ):
+            msg = f"Duplicate category names for {domain}"
+            raise ValueError(msg)
+        df_all = pd.concat(
+            [df_all, df_domain],
+            axis=0,
+            join="outer",
+        ).reset_index(drop=True)

coords_cols = {
    "area": "country (ISO3)",
-    "category": "Item",
    "unit": "Unit",
    "entity": "entity",
}
@@ -202,7 +198,7 @@ meta_data = {
}

data_if = pm2.pm2io.convert_wide_dataframe_if(
-    data_pd,
+    df_all,
    coords_cols=coords_cols,
    coords_defaults=coords_defaults,
    coords_terminologies=coords_terminologies,