@@ -1,14 +1,17 @@
+"""Download the data for all countries from the UNFCCC DI API."""
+import pandas as pd
import tqdm
import os
import pathlib

from unfccc_di_api import UNFCCCApiReader

-ROOT_DIR = pathlib.Path(os.path.abspath(os.curdir)) # This is your Project Root
+ROOT_DIR = pathlib.Path(os.path.abspath(os.curdir))


def main():
    r = UNFCCCApiReader()
+    dfs = []
    for party in tqdm.tqdm(r.parties["code"], desc="parties"):
        df = r.query(party_code=party, progress=False)
@@ -16,8 +19,29 @@ def main():
        subdir = "annexI" if annexI else "non-annexI"
        directory = ROOT_DIR / "data" / subdir
        directory.mkdir(parents=True, exist_ok=True)
-        df.to_csv(directory / f"{party}.csv.gz", compression="gzip")
-        df.to_parquet(directory / f"{party}.parquet", compression="brotli")
+        # CSV compressed with gzip is a very widely used standard.
+        # Pass mtime=0 explicitly so that the creation time is not embedded in the
+        # produced gzip file; that way the gzip file doesn't change if the
+        # contents haven't changed.
+        df.to_csv(
+            directory / f"{party}.csv.gz",
+            compression={"method": "gzip", "mtime": 0},
+        )
+        dfs.append(df)
+
+    # Save the data for all parties into one big file for easy distribution.
+    # Parquet with zstd compression is a very efficient binary format.
+    # Unfortunately, it embeds the used software versions in the metadata, so
+    # the parquet file can generally change even if the contents haven't changed.
+    # We disable the index and the statistics, which we don't use.
+    df_all = pd.concat(dfs)
+    df_all.to_parquet(
+        ROOT_DIR / "data" / "all.parquet",
+        engine="fastparquet",
+        compression="zstd",
+        index=False,
+        stats=False,
+    )


if __name__ == "__main__":
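The mtime=0 detail above is what makes the gzipped CSVs reproducible. As a rough sanity check (the DataFrame and file name below are invented purely for illustration), writing the same data to the same path twice should produce byte-identical files:

import pandas as pd

# Purely illustrative DataFrame; any data works.
df = pd.DataFrame({"year": [1990, 1991], "emissions": [1.0, 2.5]})

# Write the same data to the same path twice, pinning the gzip timestamp to 0.
df.to_csv("example.csv.gz", compression={"method": "gzip", "mtime": 0})
first = open("example.csv.gz", "rb").read()
df.to_csv("example.csv.gz", compression={"method": "gzip", "mtime": 0})
second = open("example.csv.gz", "rb").read()

# Without mtime=0 these would usually differ (the write time is embedded);
# with it, repeated writes of unchanged data are byte-identical.
assert first == second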
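The combined parquet file is meant for easy distribution. Assuming it ends up at data/all.parquet under the project root, as in the script, a consumer could load it along these lines:

import pandas as pd

# Reading with the same engine that wrote the file is the safest choice;
# pyarrow can normally read fastparquet-written files as well.
df_all = pd.read_parquet("data/all.parquet", engine="fastparquet")
print(df_all.head())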