123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 |
- import pandas as pd
- def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str,
- n_rows: int) -> pd.DataFrame:
- for row in rows_to_fix:
- # print(row)
- # find the row number and collect the row and the next two rows
- index = data.loc[data[col_to_use] == row].index
- if not list(index):
- print(f"Can't merge split row {row}")
- print(data[col_to_use])
- print(f"Merging split row {row}")
- indices_to_drop = []
- ####print(index)
- for item in index:
- loc = data.index.get_loc(item)
- ####print(data[col_to_use].loc[loc + 1])
- if n_rows == -2:
- locs_to_merge = list(range(loc - 1, loc + 1))
- loc_to_check = loc - 1
- if n_rows == -6:
- locs_to_merge = list(range(loc - 3, loc + 3))
- loc_to_check = loc - 3
- elif n_rows == -3:
- locs_to_merge = list(range(loc - 1, loc + 2))
- loc_to_check = loc - 1
- else:
- locs_to_merge = list(range(loc, loc + n_rows))
- loc_to_check = loc + 1
- if (data[col_to_use].loc[loc_to_check] == '') or n_rows == 2:
- rows_to_merge = data.iloc[locs_to_merge]
- indices_to_merge = rows_to_merge.index
- # replace numerical NaN values
- ####print(rows_to_merge)
- rows_to_merge = rows_to_merge.fillna('')
- ####print("fillna")
- ####print(rows_to_merge)
- # join the three rows
- new_row = rows_to_merge.agg(' '.join)
- # replace the double spaces that are created
- # must be done here and not at the end as splits are not always
- # the same and join would produce different col values
- new_row = new_row.str.replace(" ", " ")
- new_row = new_row.str.strip()
- # new_row = new_row.str.replace("N O", "NO")
- # new_row = new_row.str.replace(", N", ",N")
- # new_row = new_row.str.replace("- ", "-")
- data.loc[indices_to_merge[0]] = new_row
- indices_to_drop = indices_to_drop + list(indices_to_merge[1:])
- data = data.drop(indices_to_drop)
- data = data.reset_index(drop=True)
- return data
- page_defs = {
- '118': {
- "camelot": {
- "table_areas": ['49,602,551,73'],
- "columns": ['223,277,314,348,392,422,446,483'],
- "split_text": False,
- "flavor": "stream",
- },
- "rows_to_fix": {
- -6: ["Categorías de fuentes y"],
- 3: ["Todas las emisiones y las absorciones",
- "Todas las emisiones (sin [3B] Tierra ni",
- "[1A] Actividades de quema del",
- "[1A2] Industrias manufactura y de la",
- "[1B] Emisiones fugitivas provenientes de",
- "[2] Procesos industriales y uso de"],
- },
- },
- '119': {
- "camelot": {
- "table_areas": ['49,650,551,77'],
- "columns": ['228,275,317,352,394,421,446,483'],
- "split_text": True,
- "flavor": "stream",
- },
- "rows_to_fix": {
- -6: ["Categorías de fuentes y"],
- 3: ["[2B4] Producción de caprolactama,",
- "[2B8] Producción petroquímica y negro",
- "[2D] Uso de productos no energéticos de",
- "[2E1] Circuitos integrados o"],
- },
- },
- '120': {
- "camelot": {
- "table_areas": ['49,650,551,77'],
- "columns": ['223,277,314,348,392,422,446,483'],
- "split_text": False,
- "flavor": "stream",
- },
- "rows_to_fix": {
- -6: ["Categorías de fuentes y"],
- -3: ["[3B] Tierra"],
- 3: ["[2F] Uso de productos sustitutos de las",
- "[2G] Manufactura y utilización de otros",
- "[3] Agricultura, silvicultura y otros usos"],
- 2: ["[2H2] Industria de la alimentación y las",
- "[2G2] SF₆ y PFC de otros usos de"],
- },
- },
- '121': {
- "camelot": {
- "table_areas": ['49,650,551,70'],
- "columns": ['223,277,314,348,392,422,446,483'],
- "split_text": False,
- "flavor": "stream",
- },
- "rows_to_fix": {
- -6: ["Categorías de fuentes y"],
- -3: ["[3B1] Tierra forestales"],
- 3: ["[3C] Fuentes agregadas y fuentes de",
- "[3C1] Emisiones de GEI por quemado de",
- "[3C4] Emisiones directas de los N₂O de",
- "[3C5] Emisiones indirectas de los N₂O de",
- "[3C6] Emisiones indirectas de los N₂O de",
- "[4A1] Sitios gestionados de eliminación",
- "[4A2] Sitios no controlados de",
- "[4A3] Tiraderos a cielo abierto para",
- "[4B] Tratamiento biológico de los",
- ],
- },
- },
- '122': {
- "camelot": {
- "table_areas": ['49,650,551,404'],
- "columns": ['223,277,314,348,392,422,446,483'],
- "split_text": False,
- "flavor": "stream",
- },
- "rows_to_fix": {
- -6: ["Categorías de fuentes y"],
- 3: ["[4C] Incineración y quema a cielo abierto",
- "[4C1] Incineración de residuos peligrosos",
- "[4C2] Quema a cielo abierto de residuos",
- "[4D] Tratamiento y eliminación de aguas",
- "[4D1] Tratamiento y eliminación de",
- "[4D2] Tratamiento y eliminación de"],
- },
- },
- }
|