download.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. """Download the data for all countries from the UNFCCC DI API."""
  2. import pandas as pd
  3. import tqdm
  4. import os
  5. import pathlib
  6. from unfccc_di_api import UNFCCCApiReader
  7. ROOT_DIR = pathlib.Path(os.path.abspath(os.curdir))
  8. def main():
  9. r = UNFCCCApiReader()
  10. dfs = []
  11. for party in tqdm.tqdm(r.parties["code"], desc="parties"):
  12. df = r.query(party_code=party, progress=False)
  13. annexI = party in r.annex_one_reader.parties["code"].values
  14. subdir = "annexI" if annexI else "non-annexI"
  15. directory = ROOT_DIR / "data" / subdir
  16. directory.mkdir(parents=True, exist_ok=True)
  17. # CSV compressed with gzip is a very widely used standard
  18. # pass mtime=0 explicitly so that the creation time is not embedded in the
  19. # produced gzip file, which means that the gzip file doesn't change if the
  20. # contents haven't changed.
  21. df.to_csv(
  22. directory / f"{party}.csv.gz",
  23. compression={"method": 'gzip', "mtime": 0}
  24. )
  25. dfs.append(df)
  26. # Save data for all parties into one big file for easy distribution.
  27. # parquet with zstd compression is a very efficient binary format
  28. # unfortunately, it embeds the used software versions in the metadata so
  29. # generally, the parquet file can change even if the contents haven't changed.
  30. # We disable index and statistics which we don't use.
  31. df_all = pd.concat(dfs)
  32. df_all.to_parquet(
  33. ROOT_DIR / "data" / "all.parquet",
  34. engine="fastparquet",
  35. compression="zstd",
  36. index=False,
  37. stats=False
  38. )
  39. if __name__ == "__main__":
  40. main()