diff.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. """Compute and summarize differences."""
  2. import os
  3. import pandas as pd
  4. import datacompy
  5. import tqdm
  6. dfs = [
  7. pd.read_parquet(f"data/old/{i}")
  8. for i in os.listdir("data/old")
  9. if i.endswith(".parquet")
  10. ]
  11. old = pd.concat(dfs)
  12. sort_cols = ["party", "category", "classification", "measure", "gas", "unit", "year", "numberValue", "stringValue"]
  13. old = old.sort_values(
  14. sort_cols,
  15. ignore_index=True
  16. )
  17. new = pd.read_parquet("data/all.parquet")
  18. new = new.sort_values(
  19. sort_cols,
  20. ignore_index=True
  21. )
  22. join_columns=["party", "category", "classification", "measure", "gas", "year"]
  23. for party in tqdm.tqdm(new["party"].unique()):
  24. o = old[old["party"] == party].reset_index(drop=True)
  25. n = new[new["party"] == party].reset_index(drop=True)
  26. if o.equals(n):
  27. continue
  28. if len(o) == len(n):
  29. diff = o.compare(n)
  30. if diff.empty:
  31. continue
  32. print(f"{party} has differences, generating diff/{party}.html")
  33. o = n.set_index(join_columns, drop=True)
  34. n = o.set_index(join_columns, drop=True)
  35. comp = datacompy.Compare(o, n, on_index=True,
  36. df1_name="old", df2_name="new", cast_column_names_lower=False)
  37. comp.report(html_file=f"diff/{party}.html")
  38. with open(f"diff/{party}.html", 'a') as fd:
  39. fd.write("<h2>Only in old</h2>\n")
  40. comp.df1_unq_rows.sort_values(sort_cols).to_html(fd)
  41. fd.write("<h2>Only in new</h2>\n")
  42. comp.df2_unq_rows.sort_values(sort_cols).to_html(fd)
  43. fd.write("<h2>Changed</h2>\n")
  44. am = comp.all_mismatch()
  45. am.sort_values(list(am.columns)).to_html(fd)