- """Download the data for all countries from the """

import os
import pathlib

import pandas as pd
import tqdm
from unfccc_di_api import UNFCCCApiReader

ROOT_DIR = pathlib.Path(os.path.abspath(os.curdir))


def main():
    r = UNFCCCApiReader()
    dfs = []
    for party in tqdm.tqdm(r.parties["code"], desc="parties"):
        df = r.query(party_code=party, progress=False)
        # Sort each party into an Annex I or non-Annex I subdirectory.
        annexI = party in r.annex_one_reader.parties["code"].values
        subdir = "annexI" if annexI else "non-annexI"
        directory = ROOT_DIR / "data" / subdir
        directory.mkdir(parents=True, exist_ok=True)
        # CSV compressed with gzip is a very widely used standard.
        # Pass mtime=0 explicitly so that the creation time is not embedded in the
        # produced gzip file, which means that the gzip file doesn't change if the
        # contents haven't changed.
        df.to_csv(
            directory / f"{party}.csv.gz",
            compression={"method": "gzip", "mtime": 0},
        )
        dfs.append(df)

    # Save the data for all parties into one big file for easy distribution.
    # Parquet with zstd compression is a very efficient binary format.
    # Unfortunately, it embeds the used software versions in the metadata, so the
    # parquet file can generally change even if the contents haven't changed.
    # We disable the index and the statistics, which we don't use.
    df_all = pd.concat(dfs)
    df_all.to_parquet(
        ROOT_DIR / "data" / "all.parquet",
        engine="fastparquet",
        compression="zstd",
        index=False,
        stats=False,
    )


if __name__ == "__main__":
    main()
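
# Usage sketch (for illustration only, not part of this script): the files written
# above can be read back with pandas. "DEU" below is a hypothetical party code.
#
#   import pandas as pd
#   df_all = pd.read_parquet("data/all.parquet", engine="fastparquet")
#   df_deu = pd.read_csv("data/annexI/DEU.csv.gz", index_col=0)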