test_download.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. import pytest
  2. import requests
  3. from src.faostat_data_primap.download import (
  4. calculate_checksum,
  5. download_methodology,
  6. find_previous_release_path,
  7. )
  8. @pytest.fixture
  9. def temp_domain_directories(tmp_path):
  10. """
  11. Sets up a temporary directory structure for domains and releases for testing.
  12. Parameters
  13. ----------
  14. tmp_path : pathlib.Path
  15. A pytest-provided temporary directory path.
  16. Returns
  17. -------
  18. dict
  19. A dictionary containing the paths to the `downloaded_data` directory,
  20. the specific domain directory, and a list of sorted release paths.
  21. """
  22. downloaded_data = tmp_path / "downloaded_data"
  23. downloaded_data.mkdir()
  24. domains = (
  25. "farm_gate_emissions_crops",
  26. "farm_gate_livestock",
  27. "farm_gate_agriculture_energy",
  28. "land_use_forests",
  29. "land_use_fires",
  30. "land_use_drained_organic_soils",
  31. "pre_post_agricultural_production",
  32. )
  33. domain_paths = []
  34. for domain in domains:
  35. domain_path = downloaded_data / domain
  36. domain_path.mkdir()
  37. domain_paths.append(domain_path)
  38. return {
  39. "downloaded_data": downloaded_data,
  40. "domain_paths": domain_paths,
  41. }
  42. @pytest.mark.parametrize(
  43. "releases," "current_release_date, " "expected_result_date",
  44. [
  45. pytest.param(
  46. ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"],
  47. "2024-11-29",
  48. "2024-11-09",
  49. id="current release is latest release",
  50. ),
  51. pytest.param(
  52. ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"],
  53. "2023-12-13",
  54. "2022-03-18",
  55. id="current somewhere not the latest release",
  56. ),
  57. ],
  58. )
  59. def test_find_previous_release_path_exists(
  60. temp_domain_directories, releases, current_release_date, expected_result_date
  61. ):
  62. domain_path = temp_domain_directories["domain_paths"][
  63. 0
  64. ] # farm_gate_emissions_crops
  65. current_release_path = domain_path / current_release_date
  66. expected_result = domain_path / expected_result_date
  67. release_paths = []
  68. for release in releases:
  69. release_path = domain_path / release
  70. release_path.mkdir()
  71. release_paths.append(release_path)
  72. result = find_previous_release_path(current_release_path)
  73. assert result == expected_result
  74. @pytest.mark.parametrize(
  75. "releases,current_release_date",
  76. [
  77. pytest.param(
  78. ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"],
  79. "2022-03-18",
  80. id="current release is oldest release",
  81. ),
  82. pytest.param(
  83. ["2024-11-09"], "2024-11-09", id="current release is only release"
  84. ),
  85. ],
  86. )
  87. def test_find_previous_release_path_that_does_not_exists(
  88. temp_domain_directories, releases, current_release_date
  89. ):
  90. domain_path = temp_domain_directories["domain_paths"][
  91. 0
  92. ] # farm_gate_emissions_crops
  93. current_release_path = domain_path / current_release_date
  94. release_paths = []
  95. for release in releases:
  96. release_path = domain_path / release
  97. release_path.mkdir()
  98. release_paths.append(release_path)
  99. result = find_previous_release_path(current_release_path)
  100. assert not result
  101. @pytest.mark.parametrize(
  102. "releases,current_release_date",
  103. [
  104. pytest.param(
  105. ["2023-12-13", "2022-03-18", "2024-11-29", "20240-11-09"],
  106. "2022-03-18",
  107. id="typo",
  108. ),
  109. pytest.param(
  110. ["20231213", "2022-03-18", "2024-11-29", "2024-11-09"],
  111. "2022-03-18",
  112. id="missing hyphen",
  113. ),
  114. ],
  115. )
  116. def test_find_previous_release_path_wrong_dir_format(
  117. temp_domain_directories, releases, current_release_date
  118. ):
  119. domain_path = temp_domain_directories["domain_paths"][
  120. 0
  121. ] # farm_gate_emissions_crops
  122. current_release_path = domain_path / current_release_date
  123. release_paths = []
  124. for release in releases:
  125. release_path = domain_path / release
  126. release_path.mkdir()
  127. release_paths.append(release_path)
  128. with pytest.raises(ValueError) as excinfo:
  129. result = find_previous_release_path(current_release_path) # noqa: F841
  130. assert str(excinfo.value) == "All release folders must be in YYYY-MM-DD format"
  131. def test_calculate_checksum(tmp_path):
  132. filepath_a = tmp_path / "test_file_a.txt"
  133. with open(filepath_a, "w") as f:
  134. f.write("content of file a")
  135. filepath_b = tmp_path / "test_file_b.txt"
  136. with open(filepath_b, "w") as f:
  137. f.write("content of file a")
  138. filepath_c = tmp_path / "test_file_c.txt"
  139. with open(filepath_c, "w") as f:
  140. f.write("content of file c")
  141. checksum_a = calculate_checksum(filepath_a)
  142. checksum_b = calculate_checksum(filepath_b)
  143. checksum_c = calculate_checksum(filepath_c)
  144. assert checksum_a == checksum_b
  145. assert checksum_b != checksum_c
  146. def test_file_exists_in_previous_release_and_is_the_same(temp_domain_directories):
  147. # set up temporary directories
  148. downloaded_data_path = temp_domain_directories["downloaded_data"]
  149. domain_path = temp_domain_directories["domain_paths"][
  150. 0
  151. ] # farm_gate_emissions_crops
  152. # make folders for different releases
  153. for release in ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"]:
  154. release_path = domain_path / release
  155. release_path.mkdir()
  156. file_to_compare_path = domain_path / "2024-11-09" / "GCE_e.pdf"
  157. response = requests.get(
  158. "https://files-faostat.fao.org/production/GCE/GCE_e.pdf",
  159. stream=True,
  160. timeout=30,
  161. )
  162. response.raise_for_status() # Check for successful request
  163. with open(file_to_compare_path, "wb") as f:
  164. f.write(response.content)
  165. save_path = downloaded_data_path / "farm_gate_emissions_crops" / "2024-11-29"
  166. download_methodology(
  167. "https://files-faostat.fao.org/production/GCE/GCE_e.pdf", save_path=save_path
  168. )
  169. downloaded_file_path = domain_path / "2024-11-29" / "GCE_e.pdf"
  170. assert downloaded_file_path.is_symlink()
  171. def test_methodology_document_exists_in_previous_release_but_is_different(
  172. temp_domain_directories,
  173. ):
  174. # set up temporary directories
  175. domain_path = temp_domain_directories["domain_paths"][
  176. 0
  177. ] # farm_gate_emissions_crops
  178. # make folders for different releases
  179. for release in ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"]:
  180. release_path = domain_path / release
  181. release_path.mkdir()
  182. file_to_compare_path = domain_path / "2024-11-09" / "GCE_e.pdf"
  183. with open(file_to_compare_path, "wb") as f:
  184. s = "hi"
  185. f.write(s.encode("utf-8"))
  186. save_path = domain_path / "2024-11-29"
  187. download_methodology(
  188. "https://files-faostat.fao.org/production/GCE/GCE_e.pdf", save_path=save_path
  189. )
  190. downloaded_file_path = domain_path / "2024-11-29" / "GCE_e.pdf"
  191. assert downloaded_file_path.exists()
  192. def test_methodology_document_does_not_exist_in_previous_release(
  193. temp_domain_directories,
  194. ):
  195. # set up temporary directories
  196. domain_path = temp_domain_directories["domain_paths"][
  197. 0
  198. ] # farm_gate_emissions_crops
  199. # make folders for different releases
  200. for release in ["2023-12-13", "2022-03-18", "2024-11-29", "2024-11-09"]:
  201. release_path = domain_path / release
  202. release_path.mkdir()
  203. save_path = domain_path / "2024-11-29"
  204. download_methodology(
  205. "https://files-faostat.fao.org/production/GCE/GCE_e.pdf", save_path=save_path
  206. )
  207. downloaded_file_path = domain_path / "2024-11-29" / "GCE_e.pdf"
  208. assert downloaded_file_path.exists()