před 9 měsíci · c61e7ff7f8
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ parquet-only.zip
 
															 data/diff/
														
 
															 data/old/
														
 
															 data/new/
														
 
															+diff/
														
--- a/diff.py
+++ b/diff.py
@@ -0,0 +1,56 @@
 
															+"""Compute and summarize differences."""
														
 
															+
														
 
															+import os
														
 
															+import pandas as pd
														
 
															+import datacompy
														
 
															+import tqdm
														
 
															+
														
 
															+
														
 
															+dfs = [
														
 
															+    pd.read_parquet(f"data/old/{i}")
														
 
															+    for i in os.listdir("data/old")
														
 
															+    if i.endswith(".parquet")
														
 
															+]
														
 
															+old = pd.concat(dfs)
														
 
															+
														
 
															+sort_cols = ["party", "category", "classification", "measure", "gas", "unit", "year", "numberValue", "stringValue"]
														
 
															+old = old.sort_values(
														
 
															+    sort_cols,
														
 
															+    ignore_index=True
														
 
															+)
														
 
															+
														
 
															+new = pd.read_parquet("data/all.parquet")
														
 
															+
														
 
															+new = new.sort_values(
														
 
															+    sort_cols,
														
 
															+    ignore_index=True
														
 
															+)
														
 
															+
														
 
															+join_columns=["party", "category", "classification", "measure", "gas", "year"]
														
 
															+
														
 
															+for party in tqdm.tqdm(new["party"].unique()):
														
 
															+    o = old[old["party"] == party].reset_index(drop=True)
														
 
															+    n = new[new["party"] == party].reset_index(drop=True)
														
 
															+    if o.equals(n):
														
 
															+        continue
														
 
															+    if len(o) == len(n):
														
 
															+        diff = o.compare(n)
														
 
															+        if diff.empty:
														
 
															+            continue
														
 
															+
														
 
															+    print(f"{party} has differences, generating diff/{party}.html")
														
 
															+
														
 
															+    o = n.set_index(join_columns, drop=True)
														
 
															+    n = o.set_index(join_columns, drop=True)
														
 
															+
														
 
															+    comp = datacompy.Compare(o, n, on_index=True,
														
 
															+                             df1_name="old", df2_name="new", cast_column_names_lower=False)
														
 
															+    comp.report(html_file=f"diff/{party}.html")
														
 
															+    with open(f"diff/{party}.html", 'a') as fd:
														
 
															+        fd.write("<h2>Only in old</h2>\n")
														
 
															+        comp.df1_unq_rows.sort_values(sort_cols).to_html(fd)
														
 
															+        fd.write("<h2>Only in new</h2>\n")
														
 
															+        comp.df2_unq_rows.sort_values(sort_cols).to_html(fd)
														
 
															+        fd.write("<h2>Changed</h2>\n")
														
 
															+        am = comp.all_mismatch()
														
 
															+        am.sort_values(list(am.columns)).to_html(fd)
														
--- a/howto_new_release.md
+++ b/howto_new_release.md
@@ -26,16 +26,11 @@ git pull
 
															 to pull all changes.
														
 
															 If you want to compare differences, checkout the base you want to compare against,
														
 
															-move the CSVs from it to a temporary folder called `old`, then checkout the latest
														
 
															-state again and copy the CSVs to a temporary folder called `new`.
														
 
															-Then run (in fish):
														
 
															-
														
 
															-```fish
														
 
															-for i in old/*.csv; echo $i; csvdiff --ignore-columns 0 -p 1,2,3,4,5,7 -o word-diff $i  new/$(basename $i) > diff_$(basename $i); end
														
 
															-```
														
 
															-
														
 
															-you need https://github.com/aswinkarthik/csvdiff for that. Afterwards, you can check
														
 
															-the `diff_{country}.csv` files for changes.
														
 
															+move the `all.parquet` file from it to a temporary location (by default `diff.py` reads the old state from `data/old/*.parquet`), then check out the latest
														
 
															+state again.
														
 
															+Then you can point the `diff.py` script at the files you just checked out and run it
														
 
															+to generate HTML files which show the differences between the old and the new state.
														
 
															+Note that if there are no differences, no HTML files are generated.
														
 
															 ## 3. release a new version of the data package
														
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ treelib
 
															 tqdm
														
 
															 requests
														
 
															 fastparquet
														
 
															+datacompy