@@ -1,14 +1,17 @@
+"""Download the data for all countries from the UNFCCC DI API."""
+import pandas as pd
import tqdm
import os
import pathlib

from unfccc_di_api import UNFCCCApiReader

-ROOT_DIR = pathlib.Path(os.path.abspath(os.curdir)) # This is your Project Root
+ROOT_DIR = pathlib.Path(os.path.abspath(os.curdir))


def main():
    r = UNFCCCApiReader()
+    dfs = []
    for party in tqdm.tqdm(r.parties["code"], desc="parties"):
        df = r.query(party_code=party, progress=False)
@@ -16,8 +19,29 @@ def main():
        subdir = "annexI" if annexI else "non-annexI"
        directory = ROOT_DIR / "data" / subdir
        directory.mkdir(parents=True, exist_ok=True)
-        df.to_csv(directory / f"{party}.csv.gz", compression="gzip")
-        df.to_parquet(directory / f"{party}.parquet", compression="brotli")
+        # CSV compressed with gzip is a very widely used standard.
+        # Pass mtime=0 explicitly so that the creation time is not embedded in the
+        # produced gzip file; that way the gzip file doesn't change if the
+        # contents haven't changed.
+        df.to_csv(
+            directory / f"{party}.csv.gz",
+            compression={"method": "gzip", "mtime": 0},
+        )
+        dfs.append(df)
+
+    # Save the data for all parties into one big file for easy distribution.
+    # Parquet with zstd compression is a very efficient binary format.
+    # Unfortunately, it embeds the used software versions in the metadata, so
+    # the parquet file can generally change even if the contents haven't changed.
+    # We disable the index and the statistics, which we don't use.
+    df_all = pd.concat(dfs)
+    df_all.to_parquet(
+        ROOT_DIR / "data" / "all.parquet",
+        engine="fastparquet",
+        compression="zstd",
+        index=False,
+        stats=False,
+    )


if __name__ == "__main__":
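The mtime=0 detail above is what makes the gzipped CSVs reproducible. As a rough sanity check (the DataFrame and file name below are invented purely for illustration), writing the same data to the same path twice should produce byte-identical files:

import pandas as pd

# Purely illustrative DataFrame; any data works.
df = pd.DataFrame({"year": [1990, 1991], "emissions": [1.0, 2.5]})

# Write the same data to the same path twice, pinning the gzip timestamp to 0.
df.to_csv("example.csv.gz", compression={"method": "gzip", "mtime": 0})
first = open("example.csv.gz", "rb").read()
df.to_csv("example.csv.gz", compression={"method": "gzip", "mtime": 0})
second = open("example.csv.gz", "rb").read()

# Without mtime=0 these would usually differ (the write time is embedded);
# with it, repeated writes of unchanged data are byte-identical.
assert first == second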
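The combined parquet file is meant for easy distribution. Assuming it ends up at data/all.parquet under the project root, as in the script, a consumer could load it along these lines:

import pandas as pd

# Reading with the same engine that wrote the file is the safest choice;
# pyarrow can normally read fastparquet-written files as well.
df_all = pd.read_parquet("data/all.parquet", engine="fastparquet")
print(df_all.head())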