Ver Fonte

all land use domains [skip ci]

Daniel Busch há 6 meses atrás
pai
commit
5b28d9d465
2 ficheiros alterados com 208 adições e 19 exclusões
  1. 167 6
      src/faostat_data_primap/helper/definitions.py
  2. 41 13
      src/faostat_data_primap/read.py

+ 167 - 6
src/faostat_data_primap/helper/definitions.py

@@ -78,6 +78,7 @@ downloaded_data_path = root_path / "downloaded_data"
 
 
 # data reading
 # data reading
 areas_to_remove_base = [
 areas_to_remove_base = [
+    # We can aggregate these country groups ourselves if we need to
     "World",
     "World",
     "Africa",
     "Africa",
     "Eastern Africa",
     "Eastern Africa",
@@ -119,7 +120,11 @@ areas_to_remove_base = [
 read_config_all = {
 read_config_all = {
     "farm_gate_agriculture_energy": {
     "farm_gate_agriculture_energy": {
         "2024-11-14": {
         "2024-11-14": {
-            "units_to_remove" : ["TJ"],
+            # todo is NOFLAG the right choice?
+            "filename": "Emissions_Agriculture_Energy_E_All_Data_NOFLAG.csv",
+            # we don't need energy in Joule
+            # todo maybe explicitly deleting elements is better
+            "units_to_remove": ["TJ"],
             "areas_to_remove": [
             "areas_to_remove": [
                 *areas_to_remove_base,
                 *areas_to_remove_base,
             ],
             ],
@@ -128,21 +133,31 @@ read_config_all = {
                 "Emissions (CH4)": "CH4",
                 "Emissions (CH4)": "CH4",
                 "Emissions (N2O)": "N2O",
                 "Emissions (N2O)": "N2O",
             },
             },
-            "columns_to_drop" : ["Element", "Element Code", "Item", "Item Code", "Area Code (M49)", "Area", "Area Code"],
+            "columns_to_drop": [
+                "Element",
+                "Element Code",
+                "Item",
+                "Item Code",
+                "Area Code (M49)",
+                "Area",
+                "Area Code",
+            ],
         }
         }
     },
     },
     "farm_gate_emissions_crops": {
     "farm_gate_emissions_crops": {
         "2024-11-14": {
         "2024-11-14": {
+            "filename": "Emissions_crops_E_All_Data_NOFLAG.csv",
             "areas_to_remove": [
             "areas_to_remove": [
                 *areas_to_remove_base,
                 *areas_to_remove_base,
                 "European Union (27)",
                 "European Union (27)",
                 # This seems to be data for a Belgian province,
                 # This seems to be data for a Belgian province,
                 # I don't think we need it
                 # I don't think we need it
                 "Belgium-Luxembourg",
                 "Belgium-Luxembourg",
-                # We cannot split combined country data
+                # I'm not sure if we can downscale these two
                 "Serbia and Montenegro",
                 "Serbia and Montenegro",
             ],
             ],
             "elements_to_remove": [
             "elements_to_remove": [
+                # all these elements are not emissions
                 "Crop residues (N content)",
                 "Crop residues (N content)",
                 "Burning crop residues (Biomass burned, dry matter)",
                 "Burning crop residues (Biomass burned, dry matter)",
                 "Area harvested",
                 "Area harvested",
@@ -164,9 +179,155 @@ read_config_all = {
                 "Indirect emissions (N2O that leaches) (Synthetic fertilizers)": "N2O",
                 "Indirect emissions (N2O that leaches) (Synthetic fertilizers)": "N2O",
                 "Indirect emissions (N2O that volatilises) (Synthetic fertilizers)": "N2O",
                 "Indirect emissions (N2O that volatilises) (Synthetic fertilizers)": "N2O",
             },
             },
-            "columns_to_drop" : ["Element", "Element Code", "Item", "Item Code", "Area Code (M49)", "Area",
+            "columns_to_drop": [
-                                 "Area Code", 'Item Code (CPC)', 'Source Code'],
+                "Element",
-
+                "Element Code",
+                "Item",
+                "Item Code",
+                "Area Code (M49)",
+                "Area",
+                "Area Code",
+                "Item Code (CPC)",
+                "Source Code",
+            ],
+        }
+    },
+    "farm_gate_livestock": {
+        "2024-11-14": {
+            "filename": "Emissions_livestock_E_All_Data_NOFLAG.csv",
+            "areas_to_remove": [
+                *areas_to_remove_base,
+                "Belgium-Luxembourg",
+                "Serbia and Montenegro",
+                "European Union (27)",
+            ],
+            "elements_to_remove": [
+                "Stocks",  # number of animals
+                "Manure management (manure treated, N content)",
+                "Manure left on pasture (N content)",
+                "Manure left on pasture that leaches (N content)",
+                "Manure left on pasture that volatilises (N content)",
+                "Manure applied to soils (N content)",
+                "Manure applied to soils that leaches (N content)",
+                "Manure applied to soils that volatilises (N content)",
+            ],
+            "entity_mapping": {
+                # todo we could make this smarter and get the entity from the string
+                "Livestock total (Emissions N2O)": "N2O",
+                "Livestock total (Emissions CH4)": "CH4",
+                "Enteric fermentation (Emissions CH4)": "CH4",
+                "Manure management (Emissions CH4)": "CH4",
+                "Manure management (Emissions N2O)": "N2O",
+                "Manure management (Direct emissions N2O)": "N2O",
+                "Manure management (Indirect emissions N2O)": "N2O",
+                "Manure left on pasture (Emissions N2O)": "N2O",
+                "Manure left on pasture (Direct emissions N2O)": "N2O",
+                "Indirect emissions (N2O that leaches) (Manure on pasture)": "N2O",
+                "Indirect emissions (N2O that volatilises) (Manure on pasture)": "N2O",
+                "Manure left on pasture (Indirect emissions N2O)": "N2O",
+                "Emissions (N2O) (Manure applied)": "N2O",
+                "Manure applied to soils (Direct emissions N2O)": "N2O",
+                "Indirect emissions (N2O that leaches) (Manure applied)": "N2O",
+                "Indirect emissions (N2O that volatilises) (Manure applied)": "N2O",
+                "Manure applied to soils (Indirect emissions N2O)": "N2O",
+            },
+            "columns_to_drop": [
+                "Element",
+                "Element Code",
+                "Item",
+                "Item Code",
+                "Area Code (M49)",
+                "Area",
+                "Area Code",
+                "Item Code (CPC)",
+                "Source Code",
+            ],
+        }
+    },
+    "land_use_drained_organic_soils": {
+        "2023-11-09": {
+            "filename": "Emissions_Drained_Organic_Soils_E_All_Data_NOFLAG.csv",
+            "areas_to_remove": [
+                *areas_to_remove_base,
+                "Belgium-Luxembourg",
+                "Serbia and Montenegro",
+                "European Union (27)",
+                # todo check whether the Channel Islands belong to the UK
+                "Channel Islands",
+            ],
+            "elements_to_remove": [
+                "Area",
+                # todo can we convert this into emissions?
+                "Net stock change (C)",
+            ],
+            "entity_mapping": {
+                "Emissions (N2O)": "N2O",
+                "Emissions (CO2)": "CO2",
+            },
+            "columns_to_drop": [
+                "Element",
+                "Element Code",
+                "Item",
+                "Item Code",
+                "Area Code (M49)",
+                "Area",
+                "Area Code",
+                "Source Code",
+            ],
+        }
+    },
+    "land_use_fires": {
+        "2023-11-09": {
+            "filename": "Emissions_Land_Use_Fires_E_All_Data_NOFLAG.csv",
+            "areas_to_remove": [
+                *areas_to_remove_base,
+                "Belgium-Luxembourg",
+                "Serbia and Montenegro",
+                "European Union (27)",
+                # todo check whether the Channel Islands belong to the UK
+                "Channel Islands",
+            ],
+            "elements_to_remove": ["Biomass burned (dry matter)", "Burned Area"],
+            "entity_mapping": {
+                "Emissions (CH4)": "CH4",
+                "Emissions (N2O)": "N2O",
+                "Emissions (CO2)": "CO2",
+            },
+            "columns_to_drop": [
+                "Element",
+                "Element Code",
+                "Item",
+                "Item Code",
+                "Area Code (M49)",
+                "Area",
+                "Area Code",
+                "Source Code",
+            ],
+        }
+    },
+    "land_use_forests": {
+        "2024-11-14": {
+            "filename": "Emissions_Land_Use_Forests_E_All_Data_NOFLAG.csv",
+            "areas_to_remove": [
+                *areas_to_remove_base,
+                "Belgium-Luxembourg",
+                "Serbia and Montenegro",
+                "European Union (27)",
+            ],
+            "elements_to_remove": [
+                "Area",
+            ],
+            "entity_mapping": {"Net emissions/removals (CO2) (Forest land)": "CO2"},
+            "columns_to_drop": [
+                "Element",
+                "Element Code",
+                "Item",
+                "Item Code",
+                "Area Code (M49)",
+                "Area",
+                "Area Code",
+                "Source Code",
+            ],
         }
         }
     },
     },
 }
 }

+ 41 - 13
src/faostat_data_primap/read.py

@@ -8,9 +8,7 @@ from src.faostat_data_primap.helper.definitions import (
     downloaded_data_path,
     downloaded_data_path,
     read_config_all,
     read_config_all,
 )
 )
-
 custom_country_mapping_code = {}
 custom_country_mapping_code = {}
-
 custom_country_mapping_name = {
 custom_country_mapping_name = {
     # farm gate agriculture energy
     # farm gate agriculture energy
     "Bolivia (Plurinational State of)": "BOL",
     "Bolivia (Plurinational State of)": "BOL",
@@ -29,7 +27,7 @@ custom_country_mapping_name = {
     "Venezuela (Bolivarian Republic of)": "VEN",
     "Venezuela (Bolivarian Republic of)": "VEN",
     "Yugoslav SFR": "YUG",
     "Yugoslav SFR": "YUG",
     "World": "EARTH",
     "World": "EARTH",
-    # Andrew cement (probably not needed)
+    # todo Andrews cement list below (deleted commented lines)
     # "Bonaire, Saint Eustatius and Saba": "BES",
     # "Bonaire, Saint Eustatius and Saba": "BES",
     # "Cape Verde": "CPV",
     # "Cape Verde": "CPV",
     "Democratic Republic of the Congo": "COD",
     "Democratic Republic of the Congo": "COD",
@@ -43,6 +41,14 @@ custom_country_mapping_name = {
     "Wallis and Futuna Islands": "WLF",
     "Wallis and Futuna Islands": "WLF",
     # farm gate emissions crops
     # farm gate emissions crops
     "United States Virgin Islands": "VIR",
     "United States Virgin Islands": "VIR",
+    # todo is this relevant to us?
+    'Pacific Islands Trust Territory' : 'PIC',
+    'Svalbard and Jan Mayen Islands' : "SJM",  # Norway
+    # something goes wrong with non-ASCII characters in land_use_forests
+    "Côte d'Ivoire" : "CIV",
+'Curaçao' : "CUW",
+   "Réunion" : "REU",
+'Türkiye' : "TUR",
 }
 }
 
 
 
 
@@ -104,25 +110,42 @@ files_to_read = (
     (
     (
         "farm_gate_agriculture_energy",
         "farm_gate_agriculture_energy",
         "2024-11-14",
         "2024-11-14",
-        "Emissions_Agriculture_Energy_E_All_Data_NOFLAG.csv",
     ),
     ),
     (
     (
         "farm_gate_emissions_crops",
         "farm_gate_emissions_crops",
         "2024-11-14",
         "2024-11-14",
-        "Emissions_crops_E_All_Data_NOFLAG.csv",
     ),
     ),
+    (
+        "farm_gate_livestock",
+        "2024-11-14",
+    ),
+    (
+        "land_use_drained_organic_soils",
+        "2023-11-09",
+    ),
+    (
+        "land_use_fires",
+        "2023-11-09",
+    ),
+    (
+    "land_use_forests",
+        "2024-11-14",
+    )
 )
 )
 
 
 df_all = None
 df_all = None
-for domain, release, filename in files_to_read:
+country_mapping = {}
-    dataset_path = downloaded_data_path / domain / release / filename
+# todo remove reversed, I'm using it to get the new domain first in the debugger
+for domain, release in reversed(files_to_read):
     read_config = read_config_all[domain][release]
     read_config = read_config_all[domain][release]
 
 
-    df_domain = pd.read_csv(dataset_path)
+    print(f"Read {read_config["filename"]}")
+    dataset_path = downloaded_data_path / domain / release / read_config["filename"]
+    # There are some non-utf8 characters in Emissions_Drained_Organic_Soils_E_All_Data_NOFLAG.csv
+    df_domain = pd.read_csv(dataset_path, encoding = "ISO-8859-1")
 
 
     # remove rows by unit
     # remove rows by unit
-    # todo align pattern with below
+    # todo this is maybe not a good idea as it hides the elements to be removed
-    # df_domain = df_domain[df_domain["Unit"] != "TJ"]
     if "units_to_remove" in read_config.keys():
     if "units_to_remove" in read_config.keys():
         df_domain = df_domain[~df_domain["Unit"].isin(read_config["units_to_remove"])]
         df_domain = df_domain[~df_domain["Unit"].isin(read_config["units_to_remove"])]
 
 
@@ -136,8 +159,10 @@ for domain, release, filename in files_to_read:
     if "areas_to_remove" in read_config.keys():
     if "areas_to_remove" in read_config.keys():
         df_domain = df_domain[~df_domain["Area"].isin(read_config["areas_to_remove"])]
         df_domain = df_domain[~df_domain["Area"].isin(read_config["areas_to_remove"])]
 
 
-    # todo we shouldn't re-compute this everytime
+    # country name to ISO3 country code mapping
-    country_mapping = {c: get_country_code(c) for c in df_domain["Area"].unique()}
+    countries_to_map = [c for c in df_domain["Area"].unique() if c not in country_mapping.keys()]
+    for country_to_map in countries_to_map:
+        country_mapping[country_to_map] = get_country_code(country_to_map)
 
 
     # create country columns
     # create country columns
     df_domain["country (ISO3)"] = df_domain["Area"].map(country_mapping)
     df_domain["country (ISO3)"] = df_domain["Area"].map(country_mapping)
@@ -156,6 +181,7 @@ for domain, release, filename in files_to_read:
 
 
     if df_all is None:
     if df_all is None:
         df_all = df_domain
         df_all = df_domain
+        break
     else:
     else:
         # makes sure there are no duplicate category names
         # makes sure there are no duplicate category names
         if any(
         if any(
@@ -176,12 +202,13 @@ coords_cols = {
     "area": "country (ISO3)",
     "area": "country (ISO3)",
     "unit": "Unit",
     "unit": "Unit",
     "entity": "entity",
     "entity": "entity",
+    "source" : "Source"
 }
 }
 
 
 coords_terminologies = {"area": "ISO3", "category": "FAOSTAT"}
 coords_terminologies = {"area": "ISO3", "category": "FAOSTAT"}
 
 
 coords_defaults = {
 coords_defaults = {
-    "source": "FAO",
+    # "source": "FAO",
     "scenario": release,
     "scenario": release,
 }
 }
 
 
@@ -208,6 +235,7 @@ data_if = pm2.pm2io.convert_wide_dataframe_if(
     meta_data=meta_data,
     meta_data=meta_data,
 )
 )
 
 
+pass
 # steps:
 # steps:
 # convert to primap2 format
 # convert to primap2 format
 # save raw data set
 # save raw data set