Explorar o código

scrape last updated tag from domain website

Daniel Busch hai 5 meses
pai
achega
2b893048da

+ 0 - 1
downloaded_data/farm_gate_agriculture_energy/2024-11-11/test.zip

@@ -1 +0,0 @@
-../../../.git/annex/objects/4m/p0/MD5E-s1131872--3a3329f2115c62bab08ba71183623db7.zip/MD5E-s1131872--3a3329f2115c62bab08ba71183623db7.zip

+ 0 - 1
downloaded_data/farm_gate_emissions_crops/2024-11-11/test.zip

@@ -1 +0,0 @@
-../../../.git/annex/objects/mG/4x/MD5E-s5025708--d6b35891c494f61bb1699f669611a959.zip/MD5E-s5025708--d6b35891c494f61bb1699f669611a959.zip

+ 0 - 1
downloaded_data/farm_gate_livestock/2024-11-11/test.zip

@@ -1 +0,0 @@
-../../../.git/annex/objects/1M/0X/MD5E-s33910537--28bb5f9131517238e7a112e6871d5898.zip/MD5E-s33910537--28bb5f9131517238e7a112e6871d5898.zip

+ 0 - 1
downloaded_data/land_use_drained_organic_soils/2024-11-11/test.zip

@@ -1 +0,0 @@
-../../../.git/annex/objects/FK/vp/MD5E-s387900--a022e0142fa658793302f93ce4820f51.zip/MD5E-s387900--a022e0142fa658793302f93ce4820f51.zip

+ 0 - 1
downloaded_data/land_use_fires/2024-11-11/test.zip

@@ -1 +0,0 @@
-../../../.git/annex/objects/8j/pF/MD5E-s1749124--dc55869803658e9cd776b60ae24107eb.zip/MD5E-s1749124--dc55869803658e9cd776b60ae24107eb.zip

+ 0 - 1
downloaded_data/land_use_forests/2024-11-11/test.zip

@@ -1 +0,0 @@
-../../../.git/annex/objects/km/WQ/MD5E-s177332--a1265221e2763f2048a45f727864166e.zip/MD5E-s177332--a1265221e2763f2048a45f727864166e.zip

+ 0 - 1
downloaded_data/pre_post_agricultural_production/2024-11-11/test.zip

@@ -1 +0,0 @@
-../../../.git/annex/objects/0z/JM/MD5E-s4159211--d529d764b672c1b778d13a7ca2cc3d13.zip/MD5E-s4159211--d529d764b672c1b778d13a7ca2cc3d13.zip

+ 178 - 1
poetry.lock

@@ -826,6 +826,17 @@ files = [
 docs = ["Sphinx", "furo"]
 test = ["objgraph", "psutil"]
 
+[[package]]
+name = "h11"
+version = "0.14.0"
+description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
+    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
+]
+
 [[package]]
 name = "humanize"
 version = "4.11.0"
@@ -1732,6 +1743,20 @@ files = [
     {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"},
 ]
 
+[[package]]
+name = "outcome"
+version = "1.3.0.post0"
+description = "Capture the outcome of Python function calls."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"},
+    {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"},
+]
+
+[package.dependencies]
+attrs = ">=19.2.0"
+
 [[package]]
 name = "packaging"
 version = "24.1"
@@ -1957,6 +1982,18 @@ files = [
 [package.extras]
 windows-terminal = ["colorama (>=0.4.6)"]
 
+[[package]]
+name = "pysocks"
+version = "1.7.1"
+description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information."
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"},
+    {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"},
+    {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
+]
+
 [[package]]
 name = "pytest"
 version = "7.4.4"
@@ -2011,6 +2048,20 @@ files = [
 [package.dependencies]
 six = ">=1.5"
 
+[[package]]
+name = "python-dotenv"
+version = "1.0.1"
+description = "Read key-value pairs from a .env file and set them as environment variables"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"},
+    {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"},
+]
+
+[package.extras]
+cli = ["click (>=5.0)"]
+
 [[package]]
 name = "python-gitlab"
 version = "5.0.0"
@@ -2442,6 +2493,25 @@ files = [
 cryptography = ">=2.0"
 jeepney = ">=0.6"
 
+[[package]]
+name = "selenium"
+version = "4.26.1"
+description = "Official Python bindings for Selenium WebDriver"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "selenium-4.26.1-py3-none-any.whl", hash = "sha256:1db3f3a0cd5bb07624fa8a3905a6fdde1595a42185a0617077c361dc53d104fb"},
+    {file = "selenium-4.26.1.tar.gz", hash = "sha256:7640f3f08ae7f4e450f895678e8a10a55eb4e4ca18311ed675ecc4684b96b683"},
+]
+
+[package.dependencies]
+certifi = ">=2021.10.8"
+trio = ">=0.17,<1.0"
+trio-websocket = ">=0.9,<1.0"
+typing_extensions = ">=4.9,<5.0"
+urllib3 = {version = ">=1.26,<3", extras = ["socks"]}
+websocket-client = ">=1.8,<2.0"
+
 [[package]]
 name = "semantic-version"
 version = "2.10.0"
@@ -2483,6 +2553,17 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
 
+[[package]]
+name = "sniffio"
+version = "1.3.1"
+description = "Sniff out which async library your code is running under"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
+    {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
+]
+
 [[package]]
 name = "snowballstemmer"
 version = "2.2.0"
@@ -2494,6 +2575,17 @@ files = [
     {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"},
 ]
 
+[[package]]
+name = "sortedcontainers"
+version = "2.4.0"
+description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
+optional = false
+python-versions = "*"
+files = [
+    {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"},
+    {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
+]
+
 [[package]]
 name = "soupsieve"
 version = "2.6"
@@ -2929,6 +3021,42 @@ files = [
 docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"]
 test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"]
 
+[[package]]
+name = "trio"
+version = "0.27.0"
+description = "A friendly Python library for async concurrency and I/O"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "trio-0.27.0-py3-none-any.whl", hash = "sha256:68eabbcf8f457d925df62da780eff15ff5dc68fd6b367e2dde59f7aaf2a0b884"},
+    {file = "trio-0.27.0.tar.gz", hash = "sha256:1dcc95ab1726b2da054afea8fd761af74bad79bd52381b84eae408e983c76831"},
+]
+
+[package.dependencies]
+attrs = ">=23.2.0"
+cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""}
+exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
+idna = "*"
+outcome = "*"
+sniffio = ">=1.3.0"
+sortedcontainers = "*"
+
+[[package]]
+name = "trio-websocket"
+version = "0.11.1"
+description = "WebSocket library for Trio"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "trio-websocket-0.11.1.tar.gz", hash = "sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f"},
+    {file = "trio_websocket-0.11.1-py3-none-any.whl", hash = "sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638"},
+]
+
+[package.dependencies]
+exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
+trio = ">=0.11"
+wsproto = ">=0.14"
+
 [[package]]
 name = "typing-extensions"
 version = "4.12.2"
@@ -2951,6 +3079,9 @@ files = [
     {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"},
 ]
 
+[package.dependencies]
+pysocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""}
+
 [package.extras]
 brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
 h2 = ["h2 (>=4,<5)"]
@@ -2988,6 +3119,52 @@ files = [
     {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"},
 ]
 
+[[package]]
+name = "webdriver-manager"
+version = "4.0.2"
+description = "Library provides the way to automatically manage drivers for different browsers"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "webdriver_manager-4.0.2-py2.py3-none-any.whl", hash = "sha256:75908d92ecc45ff2b9953614459c633db8f9aa1ff30181cefe8696e312908129"},
+    {file = "webdriver_manager-4.0.2.tar.gz", hash = "sha256:efedf428f92fd6d5c924a0d054e6d1322dd77aab790e834ee767af392b35590f"},
+]
+
+[package.dependencies]
+packaging = "*"
+python-dotenv = "*"
+requests = "*"
+
+[[package]]
+name = "websocket-client"
+version = "1.8.0"
+description = "WebSocket client for Python with low level API options"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"},
+    {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"},
+]
+
+[package.extras]
+docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"]
+optional = ["python-socks", "wsaccel"]
+test = ["websockets"]
+
+[[package]]
+name = "wsproto"
+version = "1.2.0"
+description = "WebSockets state-machine based protocol implementation"
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736"},
+    {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"},
+]
+
+[package.dependencies]
+h11 = ">=0.9.0,<1"
+
 [[package]]
 name = "zipp"
 version = "3.20.2"
@@ -3010,4 +3187,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "9ec432931df5418a41686e0f873622ba5cdb825ad2631d4433e97e644dabb1b8"
+content-hash = "7d5caf8b8e33f06bd4898f16a920530049145487b84afdbe9bf4c6d7d23f62c9"

+ 2 - 0
pyproject.toml

@@ -11,6 +11,8 @@ include = ["LICENCE"]  # poetry uses US English so assumes it will be spelt LICE
 [tool.poetry.dependencies]
 python = "^3.10"
 datalad = "0.19.6"
+selenium = "^4.26.1"
+webdriver-manager = "^4.0.2"
 
 
 [tool.poetry.group.tests.dependencies]

+ 49 - 3
requirements.txt

@@ -1,6 +1,9 @@
 annexremote==1.6.6 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:5f78d0753c0763d95fc4c52050bd6212bb32457d32f6575dc66a83178e0283a7 \
     --hash=sha256:dee4efa33c3bd9514928af5c57c82599ca9dc0a5535121ee234ed1833a98f93e
+attrs==24.2.0 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346 \
+    --hash=sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2
 backports-tarfile==1.2.0 ; python_version >= "3.10" and python_version < "3.12" \
     --hash=sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34 \
     --hash=sha256:d75e02c268746e1b8144c278978b6e98e85de6ad16f8e4b0844a154557eca991
@@ -10,7 +13,7 @@ boto==2.49.0 ; python_version >= "3.10" and python_version < "4.0" \
 certifi==2024.8.30 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \
     --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9
-cffi==1.17.1 ; python_version >= "3.10" and python_version < "4.0" and sys_platform == "linux" and platform_python_implementation != "PyPy" \
+cffi==1.17.1 ; python_version >= "3.10" and python_version < "4.0" and os_name == "nt" and implementation_name != "pypy" or python_version >= "3.10" and python_version < "4.0" and sys_platform == "linux" and platform_python_implementation != "PyPy" \
     --hash=sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8 \
     --hash=sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2 \
     --hash=sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1 \
@@ -224,9 +227,15 @@ datalad==0.19.6 ; python_version >= "3.10" and python_version < "4.0" \
 distro==1.9.0 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed \
     --hash=sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2
+exceptiongroup==1.2.2 ; python_version >= "3.10" and python_version < "3.11" \
+    --hash=sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b \
+    --hash=sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc
 fasteners==0.19 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:758819cb5d94cdedf4e836988b74de396ceacb8e2794d21f82d131fd9ee77237 \
     --hash=sha256:b4f37c3ac52d8a445af3a66bce57b33b5e90b97c696b7b984f530cf8f0ded09c
+h11==0.14.0 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d \
+    --hash=sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761
 humanize==4.11.0 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:b53caaec8532bcb2fff70c8826f904c35943f8cecaca29d272d9df38092736c0 \
     --hash=sha256:e66f36020a2d5a974c504bd2555cf770621dbdbb6d82f94a6857c0b1ea2608be
@@ -328,6 +337,9 @@ msgpack==1.1.0 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:f3e9b4936df53b970513eac1758f3882c88658a220b58dcc1e39606dccaaf01c \
     --hash=sha256:f80bc7d47f76089633763f952e67f8214cb7b3ee6bfa489b3cb6a84cfac114cd \
     --hash=sha256:fd2906780f25c8ed5d7b323379f6138524ba793428db5d0e9d226d3fa6aa1788
+outcome==1.3.0.post0 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8 \
+    --hash=sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b
 packaging==24.1 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \
     --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124
@@ -337,9 +349,16 @@ patool==3.0.3 ; python_version >= "3.10" and python_version < "4.0" \
 platformdirs==4.3.6 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907 \
     --hash=sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb
-pycparser==2.22 ; python_version >= "3.10" and python_version < "4.0" and sys_platform == "linux" and platform_python_implementation != "PyPy" \
+pycparser==2.22 ; python_version >= "3.10" and python_version < "4.0" and os_name == "nt" and implementation_name != "pypy" or python_version >= "3.10" and python_version < "4.0" and sys_platform == "linux" and platform_python_implementation != "PyPy" \
     --hash=sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6 \
     --hash=sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc
+pysocks==1.7.1 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299 \
+    --hash=sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5 \
+    --hash=sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0
+python-dotenv==1.0.1 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca \
+    --hash=sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a
 python-gitlab==5.0.0 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:2af86a1655385c2afe13e33e79555d5394f8c7912ade04bb6e916e3d21a8716f \
     --hash=sha256:d156b9810d2a5c2916226cecf354956286a9e8133ee707d6584e40f126329956
@@ -355,15 +374,42 @@ requests==2.32.3 ; python_version >= "3.10" and python_version < "4.0" \
 secretstorage==3.3.3 ; python_version >= "3.10" and python_version < "4.0" and sys_platform == "linux" \
     --hash=sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77 \
     --hash=sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99
+selenium==4.26.1 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:1db3f3a0cd5bb07624fa8a3905a6fdde1595a42185a0617077c361dc53d104fb \
+    --hash=sha256:7640f3f08ae7f4e450f895678e8a10a55eb4e4ca18311ed675ecc4684b96b683
+sniffio==1.3.1 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \
+    --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc
+sortedcontainers==2.4.0 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88 \
+    --hash=sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0
 tqdm==4.67.0 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:0cd8af9d56911acab92182e88d763100d4788bdf421d251616040cc4d44863be \
     --hash=sha256:fe5a6f95e6fe0b9755e9469b77b9c3cf850048224ecaa8293d7d2d31f97d869a
-typing-extensions==4.12.2 ; python_version >= "3.10" and python_version < "3.11" \
+trio-websocket==0.11.1 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f \
+    --hash=sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638
+trio==0.27.0 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:1dcc95ab1726b2da054afea8fd761af74bad79bd52381b84eae408e983c76831 \
+    --hash=sha256:68eabbcf8f457d925df62da780eff15ff5dc68fd6b367e2dde59f7aaf2a0b884
+typing-extensions==4.12.2 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \
     --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8
 urllib3==2.2.3 ; python_version >= "3.10" and python_version < "4.0" \
     --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \
     --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9
+urllib3[socks]==2.2.3 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \
+    --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9
+webdriver-manager==4.0.2 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:75908d92ecc45ff2b9953614459c633db8f9aa1ff30181cefe8696e312908129 \
+    --hash=sha256:efedf428f92fd6d5c924a0d054e6d1322dd77aab790e834ee767af392b35590f
+websocket-client==1.8.0 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526 \
+    --hash=sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da
+wsproto==1.2.0 ; python_version >= "3.10" and python_version < "4.0" \
+    --hash=sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065 \
+    --hash=sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736
 zipp==3.20.2 ; python_version >= "3.10" and python_version < "3.12" \
     --hash=sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350 \
     --hash=sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29

+ 1 - 1
scripts/remove_downloads.py

@@ -18,7 +18,7 @@ from faostat_data_primap.helper.definitions import downloaded_data_path
 @click.option(
     "--date",
     help="The day on which the data to be deleted was downloaded",
-    default=None,
+    default="2024-11-11",
 )
 def run(date: str):
     """

+ 59 - 17
src/faostat_data_primap/download.py

@@ -1,44 +1,94 @@
 """Downloads data from FAOSTAT website."""
 
-import os
+import time
 import zipfile
 from datetime import datetime
 
 import datalad.api
+from bs4 import BeautifulSoup
 from helper.definitions import downloaded_data_path, root_path
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+
+
+class DateTagNotFoundError(Exception):
+    """
+    The date when the data set was last updated could not be found
+    """
+
+    def __init__(
+        self, message="The <p> tag with data-role='date' was not found on the page."
+    ):
+        # the default message names the exact tag the scraper searches for
+        super().__init__(message)
+
 
 if __name__ == "__main__":
     sources = [
         (
             "farm_gate_emissions_crops",
+            "https://www.fao.org/faostat/en/#data/GCE",
             "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
         ),
         (
             "farm_gate_livestock",
+            "https://www.fao.org/faostat/en/#data/GLE",
             "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
         ),
         (
             "farm_gate_agriculture_energy",
+            "https://www.fao.org/faostat/en/#data/GN",
             "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
         ),
         (
             "land_use_forests",
+            "https://www.fao.org/faostat/en/#data/GF",
             "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
         ),
         (
             "land_use_fires",
+            "https://www.fao.org/faostat/en/#data/GI",
             "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
         ),
         (
             "land_use_drained_organic_soils",
+            "https://www.fao.org/faostat/en/#data/GV",
             "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
         ),
         (
             "pre_post_agricultural_production",
+            "https://www.fao.org/faostat/en/#data/GPP",
             "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
         ),
     ]
-    for ds_name, url in sources:
+    for (
+        ds_name,
+        url,
+        url_download,
+    ) in sources:
+        # If the driver isn't found on your system PATH, Selenium
+        # will automatically download it for you. Make sure there is no
+        # chromedriver installed on your system
+        service = Service()
+        driver = webdriver.Chrome(service=service)
+
+        driver.get(url)
+
+        # give time to load javascript
+        time.sleep(3)
+
+        html_content = driver.page_source
+
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        date_tag = soup.find("p", {"data-role": "date"})
+
+        if not date_tag:
+            msg = "The <p> tag with data-role='date' was not found on the page."
+            raise DateTagNotFoundError(msg)
+
+        last_updated = date_tag.get_text()
+
         # make downloaded_data folder if it doesn't exist yet
         if not downloaded_data_path.exists():
             downloaded_data_path.mkdir()
@@ -49,22 +99,22 @@ if __name__ == "__main__":
             ds_path.mkdir()
 
         # create unique directory
-        # TODO unique name to be discussed
-        today = datetime.today().strftime("%Y-%m-%d")
-        local_data_dir = ds_path / today
+        last_updated_iso = datetime.strptime(last_updated, "%B %d, %Y").strftime(
+            "%Y-%m-%d"
+        )
+        local_data_dir = ds_path / last_updated_iso
 
         if not local_data_dir.exists():
             local_data_dir.mkdir()
 
         # download and commit with datalad
-        local_filename = local_data_dir / "test.zip"
+        local_filename = local_data_dir / f"{ds_name}.zip"
         datalad.api.download_url(
-            urls=url,
+            urls=url_download,
             message=f"Added {ds_name}",
             path=str(local_filename),
         )
 
-        # unzip
         if local_filename.exists():
             print(f"Download => {local_filename.relative_to(root_path)}")
             # unzip data (only for new downloads)
@@ -74,7 +124,7 @@ if __name__ == "__main__":
                     zipped_file.extractall(str(local_filename.parent))
                     print(f"Extracted {len(zipped_file.namelist())} files.")
                     zipped_file.close()
-                    os.remove(local_filename)
+                    # os.remove(local_filename)
                 # TODO Better error logging/visibility
                 except zipfile.BadZipFile:
                     print(
@@ -90,11 +140,3 @@ if __name__ == "__main__":
                     f"Not attempting to extract "
                     f"{local_filename.relative_to(root_path)}."
                 )
-
-        # Questions:
-        # * Push to datalad .zip and unzipped, or only unzipped?
-        # * What unique directory name to use -
-        # date or last updated from main data page?
-        # * Pydoit to execute download script that stages files
-        # and then push via command line
-        # or is there another solution?