소스 검색

feat: add the diff.py script to properly compare versions of the dataset.

Mika Pflüger 6 달 전
부모
커밋
c61e7ff7f8
4개의 변경된 파일63개의 추가작업 그리고 10개의 파일을 삭제
  1. 1 0
      .gitignore
  2. 56 0
      diff.py
  3. 5 10
      howto_new_release.md
  4. 1 0
      requirements.txt

+ 1 - 0
.gitignore

@@ -5,3 +5,4 @@ parquet-only.zip
 data/diff/
 data/old/
 data/new/
+diff/

+ 56 - 0
diff.py

@@ -0,0 +1,56 @@
+"""Compute and summarize per-party differences between two dataset versions.
+
+Compares data/old/*.parquet with data/all.parquet; writes diff/{party}.html.
+"""
+
+import os
+
+import pandas as pd
+import datacompy
+import tqdm
+
+
+dfs = [
+    pd.read_parquet(f"data/old/{i}")
+    for i in os.listdir("data/old")
+    if i.endswith(".parquet")
+]
+old = pd.concat(dfs)
+
+sort_cols = ["party", "category", "classification", "measure", "gas", "unit", "year", "numberValue", "stringValue"]
+# Sort both versions identically so rows line up for the positional checks below.
+old = old.sort_values(sort_cols, ignore_index=True)
+new = pd.read_parquet("data/all.parquet").sort_values(sort_cols, ignore_index=True)
+
+join_columns = ["party", "category", "classification", "measure", "gas", "year"]
+
+# NOTE(review): parties that exist only in the old data are never visited here.
+for party in tqdm.tqdm(new["party"].unique()):
+    o = old[old["party"] == party].reset_index(drop=True)
+    n = new[new["party"] == party].reset_index(drop=True)
+    if o.equals(n):
+        continue
+    # Same length but not equals(): fall back to a cell-by-cell comparison.
+    if len(o) == len(n) and o.compare(n).empty:
+        continue
+
+    print(f"{party} has differences, generating diff/{party}.html")
+
+    # Index each frame by its own key columns so "old" and "new" stay distinct.
+    o = o.set_index(join_columns, drop=True)
+    n = n.set_index(join_columns, drop=True)
+
+    comp = datacompy.Compare(o, n, on_index=True,
+                             df1_name="old", df2_name="new", cast_column_names_lower=False)
+    # The reports go into diff/, which may not exist on a fresh checkout.
+    os.makedirs("diff", exist_ok=True)
+    comp.report(html_file=f"diff/{party}.html")
+    # Append full tables of unique and mismatching rows after the datacompy report.
+    with open(f"diff/{party}.html", 'a') as fd:
+        fd.write("<h2>Only in old</h2>\n")
+        comp.df1_unq_rows.sort_values(sort_cols).to_html(fd)
+        fd.write("<h2>Only in new</h2>\n")
+        comp.df2_unq_rows.sort_values(sort_cols).to_html(fd)
+        fd.write("<h2>Changed</h2>\n")
+        am = comp.all_mismatch()
+        am.sort_values(list(am.columns)).to_html(fd)

+ 5 - 10
howto_new_release.md

@@ -26,16 +26,11 @@ git pull
 to pull all changes.
 
 If you want to compare differences, checkout the base you want to compare against,
-move the CSVs from it to a temporary folder called `old`, then checkout the latest
-state again and copy the CSVs to a temporary folder called `new`.
-Then run (in fish):
-
-```fish
-for i in old/*.csv; echo $i; csvdiff --ignore-columns 0 -p 1,2,3,4,5,7 -o word-diff $i  new/$(basename $i) > diff_$(basename $i); end
-```
-
-you need https://github.com/aswinkarthik/csvdiff for that. Afterwards, you can check
-the `diff_{country}.csv` files for changes.
+move the `all.parquet` file from it to a temporary new name (`diff.py` reads the old
+state from the `.parquet` files in `data/old/`), then checkout the latest state again.
+Then you can run the `diff.py` script, which compares them against `data/all.parquet`,
+to generate HTML files in `diff/` which show the differences between old and new state.
+Note that if there are no differences, no HTML files are generated.
 
 ## 3. release a new version of the data package
 

+ 1 - 0
requirements.txt

@@ -4,3 +4,4 @@ treelib
 tqdm
 requests
 fastparquet
+datacompy