há 9 meses atrás · c61e7ff7f8
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ parquet-only.zip
 
				 data/diff/
			
 
				 data/old/
			
 
				 data/new/
			
 
				+diff/
			
--- a/diff.py
+++ b/diff.py
@@ -0,0 +1,56 @@
 
				+"""Compute and summarize differences."""
			
 
				+
			
 
				+import os
			
 
				+import pandas as pd
			
 
				+import datacompy
			
 
				+import tqdm
			
 
				+
			
 
				+
			
 
				# Previous state: read every *.parquet file found in data/old and stack
				# them into a single frame.
				dfs = []
				for entry in os.listdir("data/old"):
				    if not entry.endswith(".parquet"):
				        continue
				    dfs.append(pd.read_parquet(f"data/old/{entry}"))
				old = pd.concat(dfs)

				# Both frames are ordered by the same columns, with a fresh 0..n-1 index,
				# so that identical data ends up at identical row positions.
				sort_cols = [
				    "party",
				    "category",
				    "classification",
				    "measure",
				    "gas",
				    "unit",
				    "year",
				    "numberValue",
				    "stringValue",
				]
				old = old.sort_values(by=sort_cols, ignore_index=True)

				# Current state: the combined parquet file, ordered the same way.
				new = pd.read_parquet("data/all.parquet")
				new = new.sort_values(by=sort_cols, ignore_index=True)

				# Columns that are moved into the index before the frames are handed to
				# datacompy for an index-based comparison.
				join_columns = [
				    "party",
				    "category",
				    "classification",
				    "measure",
				    "gas",
				    "year",
				]
			
 
				+
			
 
				# Reports are written to diff/<party>.html. The directory is git-ignored,
				# so create it up front instead of relying on it already being there.
				os.makedirs("diff", exist_ok=True)

				# NOTE(review): only parties present in `new` are visited; a party that
				# exists only in `old` is never reported. Confirm this is intended.
				for party in tqdm.tqdm(new["party"].unique()):
				    # Rows of this party in each state, re-indexed from 0 so that the
				    # positional comparisons below line up.
				    o = old[old["party"] == party].reset_index(drop=True)
				    n = new[new["party"] == party].reset_index(drop=True)

				    # Cheap exits: nothing to report when the two frames are identical.
				    if o.equals(n):
				        continue
				    if len(o) == len(n):
				        # equals() is also False on dtype differences; compare() looks
				        # at the values only.
				        diff = o.compare(n)
				        if diff.empty:
				            continue

				    print(f"{party} has differences, generating diff/{party}.html")

				    # Index each side by the join columns for the index-based comparison.
				    # Each frame must be re-indexed from itself: mixing them up would
				    # compare the new data against itself (or fail, because the join
				    # columns are no longer regular columns after the first set_index).
				    o = o.set_index(join_columns, drop=True)
				    n = n.set_index(join_columns, drop=True)

				    comp = datacompy.Compare(o, n, on_index=True,
				                             df1_name="old", df2_name="new", cast_column_names_lower=False)
				    comp.report(html_file=f"diff/{party}.html")

				    # Append the row-level details after the summary report that
				    # datacompy has just written to the same file.
				    with open(f"diff/{party}.html", 'a') as fd:
				        fd.write("<h2>Only in old</h2>\n")
				        comp.df1_unq_rows.sort_values(sort_cols).to_html(fd)
				        fd.write("<h2>Only in new</h2>\n")
				        comp.df2_unq_rows.sort_values(sort_cols).to_html(fd)
				        fd.write("<h2>Changed</h2>\n")
				        am = comp.all_mismatch()
				        am.sort_values(list(am.columns)).to_html(fd)
			
--- a/howto_new_release.md
+++ b/howto_new_release.md
@@ -26,16 +26,11 @@ git pull
 
				 to pull all changes.
			
 
				 
			
 
				 If you want to compare differences, checkout the base you want to compare against,
			
 
				-move the CSVs from it to a temporary folder called `old`, then checkout the latest
			
 
				-state again and copy the CSVs to a temporary folder called `new`.
			
 
				-Then run (in fish):
			
 
				-
			
 
				-```fish
			
 
				-for i in old/*.csv; echo $i; csvdiff --ignore-columns 0 -p 1,2,3,4,5,7 -o word-diff $i  new/$(basename $i) > diff_$(basename $i); end
			
 
				-```
			
 
				-
			
 
				-you need https://github.com/aswinkarthik/csvdiff for that. Afterwards, you can check
			
 
				-the `diff_{country}.csv` files for changes.
			
 
				+move the `all.parquet` file from it to a temporary new name, then checkout the latest
			
 
				+state again.
			
 
				+Then you can point the `diff.py` script at the files you just checked out and run it
			
 
				+to generate HTML files which show the differences between the old and the new state.
			
 
				+Note that if there are no differences, no HTML files are generated.
			
 
				 
			
 
				 ## 3. release a new version of the data package
			
 
				 
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ treelib
 
				 tqdm
			
 
				 requests
			
 
				 fastparquet
			
 
				+datacompy