Browse Source

feat: add the diff.py script to properly compare versions of the dataset.

Mika Pflüger 6 months ago
parent
commit
c61e7ff7f8
4 changed files with 63 additions and 10 deletions
  1. 1 0
      .gitignore
  2. 56 0
      diff.py
  3. 5 10
      howto_new_release.md
  4. 1 0
      requirements.txt

+ 1 - 0
.gitignore

@@ -5,3 +5,4 @@ parquet-only.zip
 data/diff/
 data/diff/
 data/old/
 data/old/
 data/new/
 data/new/
+diff/

+ 56 - 0
diff.py

@@ -0,0 +1,56 @@
+"""Compute and summarize differences."""
+
+import os
+import pandas as pd
+import datacompy
+import tqdm
+
+
+dfs = [
+    pd.read_parquet(f"data/old/{i}")
+    for i in os.listdir("data/old")
+    if i.endswith(".parquet")
+]
+old = pd.concat(dfs)
+
+sort_cols = ["party", "category", "classification", "measure", "gas", "unit", "year", "numberValue", "stringValue"]
+old = old.sort_values(
+    sort_cols,
+    ignore_index=True
+)
+
+new = pd.read_parquet("data/all.parquet")
+
+new = new.sort_values(
+    sort_cols,
+    ignore_index=True
+)
+
+join_columns=["party", "category", "classification", "measure", "gas", "year"]
+
+for party in tqdm.tqdm(new["party"].unique()):
+    o = old[old["party"] == party].reset_index(drop=True)
+    n = new[new["party"] == party].reset_index(drop=True)
+    if o.equals(n):
+        continue
+    if len(o) == len(n):
+        diff = o.compare(n)
+        if diff.empty:
+            continue
+
+    print(f"{party} has differences, generating diff/{party}.html")
+
+    o = n.set_index(join_columns, drop=True)
+    n = o.set_index(join_columns, drop=True)
+
+    comp = datacompy.Compare(o, n, on_index=True,
+                             df1_name="old", df2_name="new", cast_column_names_lower=False)
+    comp.report(html_file=f"diff/{party}.html")
+    with open(f"diff/{party}.html", 'a') as fd:
+        fd.write("<h2>Only in old</h2>\n")
+        comp.df1_unq_rows.sort_values(sort_cols).to_html(fd)
+        fd.write("<h2>Only in new</h2>\n")
+        comp.df2_unq_rows.sort_values(sort_cols).to_html(fd)
+        fd.write("<h2>Changed</h2>\n")
+        am = comp.all_mismatch()
+        am.sort_values(list(am.columns)).to_html(fd)

+ 5 - 10
howto_new_release.md

@@ -26,16 +26,11 @@ git pull
 to pull all changes.
 to pull all changes.
 
 
 If you want to compare differences, checkout the base you want to compare against,
 If you want to compare differences, checkout the base you want to compare against,
-move the CSVs from it to a temporary folder called `old`, then checkout the latest
-state again and copy the CSVs to a temporary folder called `new`.
-Then run (in fish):
-
-```fish
-for i in old/*.csv; echo $i; csvdiff --ignore-columns 0 -p 1,2,3,4,5,7 -o word-diff $i  new/$(basename $i) > diff_$(basename $i); end
-```
-
-you need https://github.com/aswinkarthik/csvdiff for that. Afterwards, you can check
-the `diff_{country}.csv` files for changes.
+move the `all.parquet` file from it to a temporary new name, then checkout the latest
+state again.
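+Then you can point the `diff.py` script at the files you just checked out (it reads the old state from the `.parquet` files in `data/old/` and the new state from `data/all.parquet`) and run it
+to generate HTML files in `diff/` which show the differences between the old and the new state.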
+Note that if there are no differences, no HTML files are generated.
 
 
 ## 3. release a new version of the data package
 ## 3. release a new version of the data package
 
 

+ 1 - 0
requirements.txt

@@ -4,3 +4,4 @@ treelib
 tqdm
 tqdm
 requests
 requests
 fastparquet
 fastparquet
+datacompy