# config_MEX_BUR3.py

import pandas as pd
  2. def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str,
  3. n_rows: int) -> pd.DataFrame:
  4. for row in rows_to_fix:
  5. # print(row)
  6. # find the row number and collect the row and the next two rows
  7. index = data.loc[data[col_to_use] == row].index
  8. if not list(index):
  9. print(f"Can't merge split row {row}")
  10. print(data[col_to_use])
  11. print(f"Merging split row {row}")
  12. indices_to_drop = []
  13. ####print(index)
  14. for item in index:
  15. loc = data.index.get_loc(item)
  16. ####print(data[col_to_use].loc[loc + 1])
  17. if n_rows == -2:
  18. locs_to_merge = list(range(loc - 1, loc + 1))
  19. loc_to_check = loc - 1
  20. if n_rows == -6:
  21. locs_to_merge = list(range(loc - 3, loc + 3))
  22. loc_to_check = loc - 3
  23. elif n_rows == -3:
  24. locs_to_merge = list(range(loc - 1, loc + 2))
  25. loc_to_check = loc - 1
  26. else:
  27. locs_to_merge = list(range(loc, loc + n_rows))
  28. loc_to_check = loc + 1
  29. if (data[col_to_use].loc[loc_to_check] == '') or n_rows == 2:
  30. rows_to_merge = data.iloc[locs_to_merge]
  31. indices_to_merge = rows_to_merge.index
  32. # replace numerical NaN values
  33. ####print(rows_to_merge)
  34. rows_to_merge = rows_to_merge.fillna('')
  35. ####print("fillna")
  36. ####print(rows_to_merge)
  37. # join the three rows
  38. new_row = rows_to_merge.agg(' '.join)
  39. # replace the double spaces that are created
  40. # must be done here and not at the end as splits are not always
  41. # the same and join would produce different col values
  42. new_row = new_row.str.replace(" ", " ")
  43. new_row = new_row.str.strip()
  44. # new_row = new_row.str.replace("N O", "NO")
  45. # new_row = new_row.str.replace(", N", ",N")
  46. # new_row = new_row.str.replace("- ", "-")
  47. data.loc[indices_to_merge[0]] = new_row
  48. indices_to_drop = indices_to_drop + list(indices_to_merge[1:])
  49. data = data.drop(indices_to_drop)
  50. data = data.reset_index(drop=True)
  51. return data
  52. page_defs = {
  53. '118': {
  54. "camelot": {
  55. "table_areas": ['49,602,551,73'],
  56. "columns": ['223,277,314,348,392,422,446,483'],
  57. "split_text": False,
  58. "flavor": "stream",
  59. },
  60. "rows_to_fix": {
  61. -6: ["Categorías de fuentes y"],
  62. 3: ["Todas las emisiones y las absorciones",
  63. "Todas las emisiones (sin [3B] Tierra ni",
  64. "[1A] Actividades de quema del",
  65. "[1A2] Industrias manufactura y de la",
  66. "[1B] Emisiones fugitivas provenientes de",
  67. "[2] Procesos industriales y uso de"],
  68. },
  69. },
  70. '119': {
  71. "camelot": {
  72. "table_areas": ['49,650,551,77'],
  73. "columns": ['228,275,317,352,394,421,446,483'],
  74. "split_text": True,
  75. "flavor": "stream",
  76. },
  77. "rows_to_fix": {
  78. -6: ["Categorías de fuentes y"],
  79. 3: ["[2B4] Producción de caprolactama,",
  80. "[2B8] Producción petroquímica y negro",
  81. "[2D] Uso de productos no energéticos de",
  82. "[2E1] Circuitos integrados o"],
  83. },
  84. },
  85. '120': {
  86. "camelot": {
  87. "table_areas": ['49,650,551,77'],
  88. "columns": ['223,277,314,348,392,422,446,483'],
  89. "split_text": False,
  90. "flavor": "stream",
  91. },
  92. "rows_to_fix": {
  93. -6: ["Categorías de fuentes y"],
  94. -3: ["[3B] Tierra"],
  95. 3: ["[2F] Uso de productos sustitutos de las",
  96. "[2G] Manufactura y utilización de otros",
  97. "[3] Agricultura, silvicultura y otros usos"],
  98. 2: ["[2H2] Industria de la alimentación y las",
  99. "[2G2] SF₆ y PFC de otros usos de"],
  100. },
  101. },
  102. '121': {
  103. "camelot": {
  104. "table_areas": ['49,650,551,70'],
  105. "columns": ['223,277,314,348,392,422,446,483'],
  106. "split_text": False,
  107. "flavor": "stream",
  108. },
  109. "rows_to_fix": {
  110. -6: ["Categorías de fuentes y"],
  111. -3: ["[3B1] Tierra forestales"],
  112. 3: ["[3C] Fuentes agregadas y fuentes de",
  113. "[3C1] Emisiones de GEI por quemado de",
  114. "[3C4] Emisiones directas de los N₂O de",
  115. "[3C5] Emisiones indirectas de los N₂O de",
  116. "[3C6] Emisiones indirectas de los N₂O de",
  117. "[4A1] Sitios gestionados de eliminación",
  118. "[4A2] Sitios no controlados de",
  119. "[4A3] Tiraderos a cielo abierto para",
  120. "[4B] Tratamiento biológico de los",
  121. ],
  122. },
  123. },
  124. '122': {
  125. "camelot": {
  126. "table_areas": ['49,650,551,404'],
  127. "columns": ['223,277,314,348,392,422,446,483'],
  128. "split_text": False,
  129. "flavor": "stream",
  130. },
  131. "rows_to_fix": {
  132. -6: ["Categorías de fuentes y"],
  133. 3: ["[4C] Incineración y quema a cielo abierto",
  134. "[4C1] Incineración de residuos peligrosos",
  135. "[4C2] Quema a cielo abierto de residuos",
  136. "[4D] Tratamiento y eliminación de aguas",
  137. "[4D1] Tratamiento y eliminación de",
  138. "[4D2] Tratamiento y eliminación de"],
  139. },
  140. },
  141. }