test_download.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. import pytest
  2. import requests
  3. from src.faostat_data_primap.download import (
  4. calculate_checksum,
  5. download_methodology,
  6. find_previous_release_path,
  7. )
  8. @pytest.fixture
  9. def temp_domain_directories(tmp_path):
  10. """
  11. Sets up a temporary directory structure for domains and releases for testing.
  12. Parameters
  13. ----------
  14. tmp_path : pathlib.Path
  15. A pytest-provided temporary directory path.
  16. Returns
  17. -------
  18. dict
  19. A dictionary containing the paths to the `downloaded_data` directory,
  20. the specific domain directory, and a list of sorted release paths.
  21. """
  22. downloaded_data = tmp_path / "downloaded_data"
  23. downloaded_data.mkdir()
  24. domains = (
  25. "farm_gate_emissions_crops",
  26. "farm_gate_livestock",
  27. "farm_gate_agriculture_energy",
  28. "land_use_forests",
  29. "land_use_fires",
  30. "land_use_drained_organic_soils",
  31. "pre_post_agricultural_production",
  32. )
  33. domain_paths = []
  34. for domain in domains:
  35. domain_path = downloaded_data / domain
  36. domain_path.mkdir()
  37. domain_paths.append(domain_path)
  38. return {
  39. "downloaded_data": downloaded_data,
  40. "domain_paths": domain_paths,
  41. }
  42. @pytest.mark.parametrize(
  43. "releases," "current_release_date, " "expected_result_date",
  44. [
  45. pytest.param(
  46. ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"],
  47. "2024-11-29",
  48. "2024-11-09",
  49. id="current release is latest release",
  50. ),
  51. pytest.param(
  52. ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"],
  53. "2023-12-13",
  54. "2022-03-18",
  55. id="current somewhere not the latest release",
  56. ),
  57. ],
  58. )
  59. def test_find_previous_release_path_exists(
  60. temp_domain_directories, releases, current_release_date, expected_result_date
  61. ):
  62. domain_path = temp_domain_directories["domain_paths"][
  63. 0
  64. ] # farm_gate_emissions_crops
  65. current_release_path = domain_path / current_release_date
  66. expected_result = domain_path / expected_result_date
  67. release_paths = []
  68. for release in releases:
  69. release_path = domain_path / release
  70. release_path.mkdir()
  71. release_paths.append(release_path)
  72. result = find_previous_release_path(current_release_path)
  73. assert result == expected_result
  74. @pytest.mark.parametrize(
  75. "releases,current_release_date",
  76. [
  77. pytest.param(
  78. ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"],
  79. "2022-03-18",
  80. id="current release is oldest release",
  81. ),
  82. pytest.param(
  83. ["2024-11-09"], "2024-11-09", id="current release is only release"
  84. ),
  85. ],
  86. )
  87. def test_find_previous_release_path_that_does_not_exists(
  88. temp_domain_directories, releases, current_release_date
  89. ):
  90. domain_path = temp_domain_directories["domain_paths"][
  91. 0
  92. ] # farm_gate_emissions_crops
  93. current_release_path = domain_path / current_release_date
  94. release_paths = []
  95. for release in releases:
  96. release_path = domain_path / release
  97. release_path.mkdir()
  98. release_paths.append(release_path)
  99. result = find_previous_release_path(current_release_path)
  100. assert not result
  101. @pytest.mark.parametrize(
  102. "releases,current_release_date, error_msg",
  103. [
  104. pytest.param(
  105. ["2023-12-13", "2022-03-18", "2024-11-29", "20240-11-09"],
  106. "2022-03-18",
  107. (
  108. "All release folders must be in YYYY-MM-DD format, got "
  109. "['2022-03-18', '2023-12-13', '2024-11-29', '20240-11-09']"
  110. ),
  111. id="typo",
  112. ),
  113. pytest.param(
  114. ["20231213", "2022-03-18", "2024-11-29", "2024-11-09"],
  115. "2022-03-18",
  116. (
  117. "All release folders must be in YYYY-MM-DD format, got "
  118. "['2022-03-18', '20231213', '2024-11-09', '2024-11-29']"
  119. ),
  120. id="missing hyphen",
  121. ),
  122. ],
  123. )
  124. def test_find_previous_release_path_wrong_dir_format(
  125. temp_domain_directories, releases, current_release_date, error_msg
  126. ):
  127. domain_path = temp_domain_directories["domain_paths"][
  128. 0
  129. ] # farm_gate_emissions_crops
  130. current_release_path = domain_path / current_release_date
  131. release_paths = []
  132. for release in releases:
  133. release_path = domain_path / release
  134. release_path.mkdir()
  135. release_paths.append(release_path)
  136. with pytest.raises(ValueError) as excinfo:
  137. result = find_previous_release_path(current_release_path) # noqa: F841
  138. assert str(excinfo.value) == error_msg
  139. def test_calculate_checksum(tmp_path):
  140. filepath_a = tmp_path / "test_file_a.txt"
  141. with open(filepath_a, "w") as f:
  142. f.write("content of file a")
  143. filepath_b = tmp_path / "test_file_b.txt"
  144. with open(filepath_b, "w") as f:
  145. f.write("content of file a")
  146. filepath_c = tmp_path / "test_file_c.txt"
  147. with open(filepath_c, "w") as f:
  148. f.write("content of file c")
  149. checksum_a = calculate_checksum(filepath_a)
  150. checksum_b = calculate_checksum(filepath_b)
  151. checksum_c = calculate_checksum(filepath_c)
  152. assert checksum_a == checksum_b
  153. assert checksum_b != checksum_c
  154. def test_file_exists_in_previous_release_and_is_the_same(temp_domain_directories):
  155. # set up temporary directories
  156. downloaded_data_path = temp_domain_directories["downloaded_data"]
  157. domain_path = temp_domain_directories["domain_paths"][
  158. 0
  159. ] # farm_gate_emissions_crops
  160. # make folders for different releases
  161. for release in ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"]:
  162. release_path = domain_path / release
  163. release_path.mkdir()
  164. file_to_compare_path = domain_path / "2024-11-09" / "GCE_e.pdf"
  165. response = requests.get(
  166. "https://files-faostat.fao.org/production/GCE/GCE_e.pdf",
  167. stream=True,
  168. timeout=30,
  169. )
  170. response.raise_for_status() # Check for successful request
  171. with open(file_to_compare_path, "wb") as f:
  172. f.write(response.content)
  173. save_path = downloaded_data_path / "farm_gate_emissions_crops" / "2024-11-29"
  174. download_methodology(
  175. "https://files-faostat.fao.org/production/GCE/GCE_e.pdf", save_path=save_path
  176. )
  177. downloaded_file_path = domain_path / "2024-11-29" / "GCE_e.pdf"
  178. assert downloaded_file_path.is_symlink()
  179. def test_methodology_document_exists_in_previous_release_but_is_different(
  180. temp_domain_directories,
  181. ):
  182. # set up temporary directories
  183. domain_path = temp_domain_directories["domain_paths"][
  184. 0
  185. ] # farm_gate_emissions_crops
  186. # make folders for different releases
  187. for release in ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"]:
  188. release_path = domain_path / release
  189. release_path.mkdir()
  190. file_to_compare_path = domain_path / "2024-11-09" / "GCE_e.pdf"
  191. with open(file_to_compare_path, "wb") as f:
  192. s = "hi"
  193. f.write(s.encode("utf-8"))
  194. save_path = domain_path / "2024-11-29"
  195. download_methodology(
  196. "https://files-faostat.fao.org/production/GCE/GCE_e.pdf", save_path=save_path
  197. )
  198. downloaded_file_path = domain_path / "2024-11-29" / "GCE_e.pdf"
  199. assert downloaded_file_path.exists()
  200. def test_methodology_document_does_not_exist_in_previous_release(
  201. temp_domain_directories,
  202. ):
  203. # set up temporary directories
  204. domain_path = temp_domain_directories["domain_paths"][
  205. 0
  206. ] # farm_gate_emissions_crops
  207. # make folders for different releases
  208. for release in ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"]:
  209. release_path = domain_path / release
  210. release_path.mkdir()
  211. save_path = domain_path / "2024-11-29"
  212. download_methodology(
  213. "https://files-faostat.fao.org/production/GCE/GCE_e.pdf", save_path=save_path
  214. )
  215. downloaded_file_path = domain_path / "2024-11-29" / "GCE_e.pdf"
  216. assert downloaded_file_path.exists()