
Code to fetch annex-I submission lists and to download submissions

Johannes Gütschow 3 years ago
parent
commit 146033ed96

+ 0 - 20
Makefile

@@ -1,28 +1,8 @@
 .SILENT: help
 help:
 	echo Options:
-	echo make update-bur: Update list of BUR submissions
-	echo make download-bur: Download BUR submissions
-	echo make update-nc: Update list of NC submissions
-	echo make download-nc: Download NC submissions
-	echo make download-ndc: Download NDC submissions
 	echo make venv: create virtual environment
 
-update-bur: venv
-	datalad run -m "Fetch BUR submissions" -o downloaded_data/UNFCCC/submissions-bur.csv ./venv/bin/python code/UNFCCC_downloader/fetch_submissions_bur.py
-
-download-bur: venv
-	datalad run -m "Download BUR submissions" -i downloaded_data/UNFCCC/submissions-bur.csv ./venv/bin/python code/UNFCCC_downloader/download_bur.py
-
-update-nc: venv
-	datalad run -m "Fetch NC submissions" -o downloaded_data/UNFCCC/submissions-nc.csv ./venv/bin/python code/UNFCCC_downloader/fetch_submissions_nc.py
-
-download-nc: venv
-	datalad run -m "Download NC submissions" -i downloaded_data/UNFCCC/submissions-nc.csv ./venv/bin/python code/UNFCCC_downloader/download_nc.py
-
-download-ndc: venv
-	datalad run -m "Download NDC submissions" ./venv/bin/python code/UNFCCC_downloader/download_ndc.py
-
 venv: code/requirements.txt
 	[ -d ./venv ] || python3 -m venv venv
 	./venv/bin/pip install --upgrade pip

+ 181 - 0
code/UNFCCC_downloader/download_annexI.py

@@ -0,0 +1,181 @@
+import argparse
+import pandas as pd
+import requests
+import shutil
+import time
+import os
+import zipfile
+from datetime import date
+from selenium.webdriver import Firefox
+from selenium.webdriver.firefox.options import Options
+from random import randrange
+from pathlib import Path
+
+root = Path(__file__).parents[2]
+
+###############
+#
+# TODO
+# download directly via selenium see link below
+# https://sqa.stackexchange.com/questions/2197/
+# how-to-download-a-file-using-seleniums-webdriver
+# for automatic downloading see https://stackoverflow.com/questions/70740163/
+# python-selenium-firefox-driver-dismiss-open-save-file-popup
+###############
+
+descr = 'Download and unzip data from UNFCCC National Inventory Submissions. ' \
+        'Based on download.py from national-inventory-submissions ' \
+        '(https://github.com/openclimatedata/national-inventory-submisions)'
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument(
+    '--category',
+    help='Category to download: CRF, NIR, SEF'
+)
+parser.add_argument(
+    '--year',
+    help='Year to download'
+)
+
+args = parser.parse_args()
+year = args.year
+category = args.category.upper()
+dataset = category + year
+print(f"Downloading data for {dataset}")
+
+# generate the correct url
+url = (
+    "https://unfccc.int/process/transparency-and-reporting/"
+    "reporting-and-review-under-the-convention/"
+    "greenhouse-gas-inventories-annex-i-parties/"
+    "submissions/national-inventory-submissions-{}".format(year)
+)
+
+# TODO: years before 2019
+if int(year) == 2019:
+    url = (
+        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
+        "reporting-and-review-under-the-convention/"
+        "greenhouse-gas-inventories-annex-i-parties/"
+        "national-inventory-submissions-{}".format(year)
+    )
+
+if int(year) >= 2020:
+    url = (
+        "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
+    )
+
+download_path = root / "downloaded_data" / "UNFCCC"
+
+# if we get files of this size they are error pages and we need to
+# try the download again
+error_file_sizes = [212, 210]
+
+# Read submissions list
+submissions = pd.read_csv(download_path / f"submissions-annexI_{year}.csv")
+
+# filter submissions list for the requested category
+items = submissions[submissions.Kind == category]
+
+# set options for headless mode
+profile_path = ".firefox"
+options = Options()
+#options.add_argument('-headless')
+
+# create profile for headless mode and automatic downloading
+options.set_preference('profile', profile_path)
+options.set_preference('browser.download.folderList', 2)
+
+# set up selenium driver
+driver = Firefox(options=options)
+# visit the main data page once to create cookies
+driver.get(url)
+
+# wait a bit for the website to load before we get the cookies
+time.sleep(20)
+
+# get the session id cookie
+cookies_selenium = driver.get_cookies()
+cookies = {}
+for cookie in cookies_selenium:
+    cookies[cookie['name']] = cookie['value']
+
+new_downloaded = []
+
+for idx, submission in items.iterrows():
+    print("=" * 60)
+    title = submission.Title
+    url = submission.URL
+    country = submission.Country
+    country = country.replace(' ', '_')
+    print(f"Downloading {title} from {url}")
+
+    country_folder = download_path / country
+    if not country_folder.exists():
+        country_folder.mkdir()
+    local_filename = \
+        country_folder / dataset / \
+        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
+    if not local_filename.parent.exists():
+        local_filename.parent.mkdir()
+
+    if local_filename.exists():
+        # check file size. if 210 or 212 bytes it's the error page
+        if Path(local_filename).stat().st_size in error_file_sizes:
+            # found the error page. delete file
+            os.remove(local_filename)
+    
+    # now we have removed error pages, so a present file should not be overwritten
+    if not local_filename.exists():
+        i = 0  # reset counter
+        while not local_filename.exists() and i < 10:
+            # for i = 1 and i = 5 try to get a new session ID
+            if i == 1 or i == 5:
+                driver = Firefox(options=options)
+    
+                # visit the main data page once to create cookies
+                driver.get(url)
+                time.sleep(20)
+
+                # get the session id cookie
+                cookies_selenium = driver.get_cookies()
+                cookies = {}
+                for cookie in cookies_selenium:
+                    cookies[cookie['name']] = cookie['value']
+
+            r = requests.get(url, stream=True, cookies=cookies)
+            with open(str(local_filename), 'wb') as f:
+                shutil.copyfileobj(r.raw, f)
+            
+            # check file size. if 210 or 212 bytes it's the error page
+            if Path(local_filename).stat().st_size in error_file_sizes:
+                # found the error page. delete file
+                os.remove(local_filename)
+            
+            # sleep a bit to avoid running into captchas
+            time.sleep(randrange(5, 15))
+            # count the attempt so the retry limit of 10 can take effect
+            i += 1
+
+        if local_filename.exists():
+            new_downloaded.append(submission)
+            print(f"Download => {local_filename.relative_to(root)}")
+            # unzip data (only for new downloads)
+            if local_filename.suffix == ".zip":
+                try:
+                    zipped_file = zipfile.ZipFile(str(local_filename), 'r')
+                    zipped_file.extractall(str(local_filename.parent))
+                    print(f"Extracted {len(zipped_file.namelist())} files.")
+                    zipped_file.close()
+                # TODO Better error logging/visibility
+                except zipfile.BadZipFile:
+                    print(f"Error while trying to extract {local_filename.relative_to(root)}")
+                except NotImplementedError:
+                    print("Zip format not supported, please unzip on the command line.")
+            else:
+                print(f"Not attempting to extract {local_filename.relative_to(root)}.")
+        else:
+            print(f"Failed to download {local_filename.relative_to(root)}")
+
+    else:
+        print(f"=> Already downloaded {local_filename.relative_to(root)}")
+
+driver.close()
+
+df = pd.DataFrame(new_downloaded)
+df.to_csv(download_path / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)
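
Note: the auto-download TODO at the top of this file is carried over unresolved. For reference, a minimal sketch of the Firefox preferences that let Selenium save files directly, skipping the open/save dialog; the MIME type list is an assumption and may need to match what the UNFCCC server actually sends:

from pathlib import Path
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

download_dir = Path("downloaded_data/UNFCCC").absolute()

options = Options()
options.add_argument('-headless')
# 2 = download into the custom directory given in browser.download.dir
options.set_preference('browser.download.folderList', 2)
options.set_preference('browser.download.dir', str(download_dir))
# content types to save without asking (assumed list, adjust as needed)
options.set_preference('browser.helperApps.neverAsk.saveToDisk',
                       'application/zip,application/pdf,application/octet-stream')
# save PDFs instead of opening them in the built-in viewer
options.set_preference('pdfjs.disabled', True)

driver = Firefox(options=options)
# navigating to a file URL now saves it into download_dir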

+ 0 - 146
code/UNFCCC_downloader/download_bur.py

@@ -1,146 +0,0 @@
-import pandas as pd
-import requests
-import shutil
-import time
-import os
-from datetime import date
-from selenium.webdriver import Firefox
-from selenium.webdriver.firefox.options import Options
-from random import randrange
-
-from pathlib import Path
-root = Path(__file__).parents[2]
-"""
-based on download_bur from national-inventory-submissions
-# (https://github.com/openclimatedata/national-inventory-submisions)
-"""
-
-###############
-#
-# TODO
-# download directly via selenium see link below
-# https://sqa.stackexchange.com/questions/2197/
-# how-to-download-a-file-using-seleniums-webdriver
-###############
-
-submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" /
-                          "submissions-bur.csv")
-
-url = "https://unfccc.int/BURs"
-
-# if we get files of this size they are error pages and we need to
-# try the download again
-error_file_sizes = [212, 210]
-
-# find which BUR submission rounds exist
-present_BURs = submissions.Kind.unique()
-
-# Ensure download path and subfolders exist
-download_path = root / "downloaded_data/UNFCCC"
-if not download_path.exists():
-    download_path.mkdir(parents=True)
-
-# set options for headless mode
-profile_path = ".firefox"
-options = Options()
-#options.add_argument('-headless')
-
-# create profile for headless mode and automatic downloading
-options.set_preference('profile', profile_path)
-options.set_preference('browser.download.folderList', 2)
-
-# set up selenium driver
-driver = Firefox(options=options)
-# visit the main data page once to create cookies
-driver.get(url)
-time.sleep(20)
-
-# get the session id cookie
-cookies_selenium = driver.get_cookies()
-cookies = {}
-for cookie in cookies_selenium:
-    cookies[cookie['name']] = cookie['value']
-
-print(cookies)
-
-new_downloaded = []
-
-for idx, submission in submissions.iterrows():
-    print("=" * 60)
-    bur = submission.Kind
-    title = submission.Title
-    url = submission.URL
-    country = submission.Country
-    country = country.replace(' ', '_')
-    print(title)
-
-    country_folder = download_path / country
-    if not country_folder.exists():
-        country_folder.mkdir()
-    local_filename = country_folder / bur / url.split('/')[-1]
-    local_filename_underscore = \
-        download_path / country / bur / \
-        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
-    if not local_filename.parent.exists():
-        local_filename.parent.mkdir()
-
-    ### remove, not needed as no legacy data present
-    #if local_filename.exists():
-    #    # rename
-    #    local_filename.rename(local_filename_underscore)
-    #    print("Renamed " + bur + "/" + country + "/" + local_filename.name)
-
-    # this should never be needed but in case anything goes wrong and
-    # an error page is present it should be overwritten
-    if local_filename_underscore.exists():
-        # check file size. if 210 or 212 bytes it's the error page
-        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
-            # found the error page. delete file
-            os.remove(local_filename_underscore)
-    
-    # now we have remove error pages, so a present file should not be overwritten
-    if not local_filename_underscore.exists():
-        i = 0  # reset counter
-        while not local_filename_underscore.exists() and i < 10:
-            # for i = 0 and i = 5 try to get a new session ID
-            if i == 1 or i == 5:
-                driver = webdriver.Firefox(options=options,
-                                           firefox_profile=profile)
-    
-                # visit the main data page once to create cookies
-                driver.get(url)
-                time.sleep(20)
-                
-                # get the session id cookie
-                cookies_selenium = driver.get_cookies()
-                cookies = {}
-                for cookie in cookies_selenium:
-                    cookies[cookie['name']] = cookie['value']
-                    
-            r = requests.get(url, stream=True, cookies=cookies)
-            with open(str(local_filename_underscore), 'wb') as f:
-                shutil.copyfileobj(r.raw, f)
-            
-            # check file size. if 210 or 212 bytes it's the error page
-            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
-                # found the error page. delete file
-                os.remove(local_filename_underscore)
-            
-            # sleep a bit to avoid running into captchas
-            time.sleep(randrange(5, 15))
-            
-        if local_filename_underscore.exists():
-            new_downloaded.append(submission)
-            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
-                  "/" + local_filename_underscore.name)
-        else:
-            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
-                  + bur + "/" + local_filename_underscore.name)
-
-    else:
-        print("=> Already downloaded " + local_filename_underscore.name)
-
-driver.close()
-
-df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / "00_new_downloads_bur-{}.csv".format(date.today()), index=False)

+ 42 - 46
code/UNFCCC_downloader/download_nc.py → code/UNFCCC_downloader/download_non-annexI.py

@@ -1,3 +1,4 @@
+import argparse
 import pandas as pd
 import requests
 import shutil
@@ -7,13 +8,9 @@ from datetime import date
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
-
 from pathlib import Path
+
 root = Path(__file__).parents[2]
-"""
-based on download_bur from national-inventory-submissions
-# (https://github.com/openclimatedata/national-inventory-submisions)
-"""
 
 ###############
 #
@@ -25,19 +22,31 @@ based on download_bur from national-inventory-submissions
 # python-selenium-firefox-driver-dismiss-open-save-file-popup
 ###############
 
-submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" /
-                          "submissions-nc.csv")
+descr = 'Download data from UNFCCC non-AnnexI Submissions. ' \
+        'Based on download_bur.py from national-inventory-submissions ' \
+        '(https://github.com/openclimatedata/national-inventory-submisions)'
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument(
+    '--category',
+    help='Category to download: BUR, NC'
+)
 
-url = "https://unfccc.int/non-annex-I-NCs"
+args = parser.parse_args()
+category = args.category.upper()
+print(f"Downloading {category} submissions")
+
+if category == "BUR":
+    url = "https://unfccc.int/BURs"
+else:
+    url = "https://unfccc.int/non-annex-I-NCs"
 
 # if we get files of this size they are error pages and we need to
 # try the download again
 error_file_sizes = [212, 210]
 
-# Ensure download path and subfolders exist
+# Read submissions list
 download_path = root / "downloaded_data" / "UNFCCC"
-if not download_path.exists():
-    download_path.mkdir(parents=True)
+submissions = pd.read_csv(download_path / f"submissions-{category.lower()}.csv")
 
 # set options for headless mode
 profile_path = ".firefox"
@@ -53,6 +62,7 @@ driver = Firefox(options=options)
 # visit the main data page once to create cookies
 driver.get(url)
 
+# wait a bit for the website to load before we get the cookies
 time.sleep(20)
 
 # get the session id cookie
@@ -61,86 +71,72 @@ cookies = {}
 for cookie in cookies_selenium:
     cookies[cookie['name']] = cookie['value']
 
-print(cookies)
-
 new_downloaded = []
 
 for idx, submission in submissions.iterrows():
     print("=" * 60)
-    bur = submission.Kind
+    kind = submission.Kind
     title = submission.Title
     url = submission.URL
     country = submission.Country
     country = country.replace(' ', '_')
-    print(title)
+    print(f"Downloading {title} from {url}")
 
     country_folder = download_path / country
     if not country_folder.exists():
         country_folder.mkdir()
-    local_filename = country_folder / bur / url.split('/')[-1]
-    local_filename_underscore = \
-        download_path / country / bur / \
+    local_filename = \
+        country_folder / kind / \
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
     if not local_filename.parent.exists():
         local_filename.parent.mkdir()
 
-    ### remove, not needed as no legacy data present
-    #if local_filename.exists():
-    #    # rename
-    #    local_filename.rename(local_filename_underscore)
-    #    print("Renamed " + bur + "/" + country + "/" + local_filename.name)
-
-    # this should never be needed but in case anything goes wrong and
-    # an error page is present it should be overwritten
-    if local_filename_underscore.exists():
+    if local_filename.exists():
         # check file size. if 210 or 212 bytes it's the error page
-        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+        if Path(local_filename).stat().st_size in error_file_sizes:
             # found the error page. delete file
-            os.remove(local_filename_underscore)
+            os.remove(local_filename)
     
-    # now we have remove error pages, so a present file should not be overwritten
-    if not local_filename_underscore.exists():
+    # now we have removed error pages, so a present file should not be overwritten
+    if not local_filename.exists():
         i = 0  # reset counter
-        while not local_filename_underscore.exists() and i < 10:
+        while not local_filename.exists() and i < 10:
             # for i = 1 and i = 5 try to get a new session ID
             if i == 1 or i == 5:
-                driver = webdriver.Firefox(options=options,
-                                           firefox_profile=profile)
+                driver = Firefox(options=options)
     
                 # visit the main data page once to create cookies
                 driver.get(url)
                 time.sleep(20)
-                
+
                 # get the session id cookie
                 cookies_selenium = driver.get_cookies()
                 cookies = {}
                 for cookie in cookies_selenium:
                     cookies[cookie['name']] = cookie['value']
-                    
+
             r = requests.get(url, stream=True, cookies=cookies)
-            with open(str(local_filename_underscore), 'wb') as f:
+            with open(str(local_filename), 'wb') as f:
                 shutil.copyfileobj(r.raw, f)
             
             # check file size. if 210 or 212 bytes it's the error page
-            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+            if Path(local_filename).stat().st_size in error_file_sizes:
                 # found the error page. delete file
-                os.remove(local_filename_underscore)
+                os.remove(local_filename)
             
             # sleep a bit to avoid running into captchas
             time.sleep(randrange(5, 15))
+            # count the attempt so the retry limit of 10 can take effect
+            i += 1
             
-        if local_filename_underscore.exists():
+        if local_filename.exists():
             new_downloaded.append(submission)
-            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
-                  "/" + local_filename_underscore.name)
+            print(f"Download => {local_filename.relative_to(root)}")
         else:
-            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
-                  + bur + "/" + local_filename_underscore.name)
+            print(f"Failed to download {local_filename.relative_to(root)}")
 
     else:
-        print("=> Already downloaded " + local_filename_underscore.name)
+        print(f"=> Already downloaded {local_filename.relative_to(root)}")
 
 driver.close()
 
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / "00_new_downloads_nc-{}.csv".format(date.today()), index=False)
+df.to_csv(download_path / f"00_new_downloads_{category}-{date.today()}.csv", index=False)
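
Both download scripts rely on the same hand-off: Selenium loads the submissions page once to obtain valid session cookies, and requests then reuses those cookies for the streamed file transfers. A sketch of that pattern as a reusable helper (hypothetical function, not part of this commit):

import shutil
import requests
from selenium.webdriver import Firefox


def session_from_driver(driver: Firefox) -> requests.Session:
    # copy the browser's session cookies into a requests session
    session = requests.Session()
    for cookie in driver.get_cookies():
        session.cookies.set(cookie['name'], cookie['value'])
    return session

# usage sketch:
# session = session_from_driver(driver)
# r = session.get(file_url, stream=True)
# with open(local_filename, 'wb') as f:
#     shutil.copyfileobj(r.raw, f)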

+ 140 - 0
code/UNFCCC_downloader/fetch_submissions_annexI.py

@@ -0,0 +1,140 @@
+import argparse
+import time
+import pandas as pd
+
+from pathlib import Path
+from bs4 import BeautifulSoup
+from selenium.webdriver import Firefox
+from selenium.webdriver.firefox.options import Options
+from random import randrange
+from unfccc_submission_info import get_unfccc_submission_info
+
+root = Path(__file__).absolute().parents[2]
+
+max_tries = 10
+
+descr = ("Download UNFCCC National Inventory Submissions lists "
+         "and create list of submissions as CSV file. Based on "
+         "process.py from national-inventory-submissions "
+         "(https://github.com/openclimatedata/national-inventory-submisions)")
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument(
+    '--year',
+    help='Year to download'
+)
+
+args = parser.parse_args()
+year = args.year
+
+print("Fetching submissions for {}".format(year))
+
+url = (
+    "https://unfccc.int/process/transparency-and-reporting/"
+    "reporting-and-review-under-the-convention/"
+    "greenhouse-gas-inventories-annex-i-parties/"
+    "submissions/national-inventory-submissions-{}".format(year)
+)
+
+if int(year) == 2019:
+    url = (
+        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
+        "reporting-and-review-under-the-convention/"
+        "greenhouse-gas-inventories-annex-i-parties/"
+        "national-inventory-submissions-{}".format(year)
+    )
+
+if int(year) >= 2020:
+    url = (
+        "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
+    )
+
+print(f"Using {url} to get submissions list")
+
+# set options for headless mode
+profile_path = ".firefox"
+options = Options()
+options.add_argument('-headless')
+
+# create profile for headless mode and automatic downloading
+options.set_preference('profile', profile_path)
+
+# set up selenium driver
+driver = Firefox(options=options)
+driver.get(url)
+
+html = BeautifulSoup(driver.page_source, "html.parser")
+
+table = html.find("table")
+
+# check if the table was found. If not, the get command didn't work,
+# likely because of a captcha on the site
+### TODO replace by error message
+if not table:
+    # try to load html file from disk
+    print('Download failed, trying to load manually downloaded file')
+    fallback_file = "manual_page_downloads/National-Inventory-Submissions-{}.html".format(year)
+    try:
+        with open(fallback_file) as file:
+            content = file.read()
+    except FileNotFoundError:
+        print("Manually downloaded file " + fallback_file + " not found")
+        exit()
+    html = BeautifulSoup(content, "html.parser")
+    table = html.find("table")
+    if not table:
+        print("No submissions table found in " + fallback_file)
+        exit()
+
+links = table.findAll('a')
+
+targets = []  # sub-pages
+downloads = []
+no_downloads = []
+
+# Check links for Zipfiles or subpages
+for link in links:
+    if "href" not in link.attrs:
+        continue
+    href = link.attrs["href"]
+    if "/documents/" in href:
+        if "title" in link.attrs.keys():
+            title = link.attrs["title"]
+        else:
+            title = link.contents[0]
+        if href.startswith("/documents"):
+            href = "https://unfccc.int" + href
+        # Only add pages in the format https://unfccc.int/documents/65587
+        # to further downloads
+        if str(Path(href).parent).endswith("documents"):
+            targets.append({"title": title, "url": href})
+    elif href.endswith(".zip"):
+        if href.startswith("/files"):
+            href = "https://unfccc.int" + href
+        country = Path(href).name.split("-")[0].upper()
+        title = f"{country} {link.contents[0]}"
+        filename = Path(href).name
+        file_parts = filename.split('-')
+        # the kind is the third dash-separated part of the filename,
+        # so at least three parts are needed to index file_parts[2]
+        if len(file_parts) >= 3:
+            kind = file_parts[2].upper()
+        elif filename.startswith('asr'):
+            kind = 'CRF'
+        else:
+            kind = None
+
+        print("\t".join([kind, country, title, href]))
+        downloads.append({"Kind": kind, "Country": country, "Title": title, "URL": href})
+
+# Go through sub-pages.
+for target in targets:
+    time.sleep(randrange(5, 15))
+    url = target["url"]
+
+    submission_info = get_unfccc_submission_info(url, driver, max_tries)
+
+    if submission_info:
+        downloads = downloads + submission_info
+    else:
+        no_downloads.append((target["title"], url))
+
+if len(no_downloads) > 0:
+    print("No downloads for ", no_downloads)
+
+driver.close()
+df = pd.DataFrame(downloads)
+df.to_csv(root / "downloaded_data" / "UNFCCC" / f"submissions-annexI_{year}.csv", index=False)
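
The year-dependent URL construction above also appears verbatim in download_annexI.py; a sketch of a shared helper both scripts could import (hypothetical name, not part of this commit):

def annexI_submissions_url(year: int) -> str:
    # page listing all AnnexI inventory submissions for one year
    if year >= 2020:
        return f"https://unfccc.int/ghg-inventories-annex-i-parties/{year}"
    if year == 2019:
        return ("https://unfccc.int/process-and-meetings/transparency-and-reporting/"
                "reporting-and-review-under-the-convention/"
                "greenhouse-gas-inventories-annex-i-parties/"
                f"national-inventory-submissions-{year}")
    # layout used before 2019; per the TODO in both scripts this is
    # not yet verified for all earlier years
    return ("https://unfccc.int/process/transparency-and-reporting/"
            "reporting-and-review-under-the-convention/"
            "greenhouse-gas-inventories-annex-i-parties/"
            f"submissions/national-inventory-submissions-{year}")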

+ 9 - 51
code/UNFCCC_downloader/fetch_submissions_bur.py

@@ -8,8 +8,9 @@ from bs4 import BeautifulSoup
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
+from unfccc_submission_info import get_unfccc_submission_info
 
-root = Path(__file__).parents[2]
+root = Path(__file__).absolute().parents[2]
 
 """
 Download UNFCCC Biennial Update Report submissions
@@ -68,57 +69,14 @@ pattern = re.compile(r"BUR ?\d")
 for target in targets:
     time.sleep(randrange(5, 15))
     url = target["url"]
-    #subpage = requests.get(url, timeout=15.5)
-    driver.get(url)
-    html = BeautifulSoup(driver.page_source, "html.parser")
-    title = html.find("h1").contents[0]
-    match = pattern.search(title)
-    if match:
-        kind = match.group(0).replace(" ", "")
-    else:
-        kind = None
-
-
-    h2 = html.find("h2", text="Versions")
-    if h2:
-        div = h2.findNext("div")
-        links = div.findAll("a")
-        try:
-            country = (
-                html.find("h2", text="Countries").findNext("div").findNext("div").text
-            )
-        except AttributeError:
-            country = (
-                html.find("h2", text="Corporate Author")
-                .findNext("div")
-                .findNext("div")
-                .text
-            )
-        doctype = (
-            html.find("h2", text="Document Type").findNext("div").findNext("div").text
-        )
-        for link in links:
-            url = link.attrs["href"]
-            if not kind:
-                match = pattern.search(url.upper())
-                if match:
-                    kind = match.group(0)
-                else:
-                    if ("NIR" in doctype) or ("NIR" in title):
-                        kind = "NIR"
-                    elif "NC" in title:
-                        kind = "NC"
-            downloads.append(
-                {
-                    "Kind": kind,
-                    "Country": country,
-                    "Title": title,
-                    "URL": url,
-                }
-            )
-        print("\t".join([kind, country, title, url]))
+
+    submission_info = get_unfccc_submission_info(url, driver, 10)
+
+    if submission_info:
+        downloads = downloads + submission_info
     else:
-        no_downloads.append((title, url))
+        no_downloads.append((target["title"], url))
+
 
 if len(no_downloads) > 0:
     print("No downloads for ", no_downloads)

+ 1 - 1
code/UNFCCC_downloader/fetch_submissions_nc.py

@@ -9,7 +9,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 
-root = Path(__file__).parents[2]
+root = Path(__file__).absolute().parents[2]
 
 """
 Download UNFCCC National Communication submissions

+ 89 - 0
code/UNFCCC_downloader/unfccc_submission_info.py

@@ -0,0 +1,89 @@
+# helper functions to gather submission info from UNFCCC website
+import time
+import re
+from random import randrange
+from typing import Dict, List
+from selenium.webdriver import Firefox
+from bs4 import BeautifulSoup
+
+
+def get_unfccc_submission_info(
+        url: str,
+        driver: Firefox,
+        max_tries: int = 10,
+) -> List[Dict[str, str]]:
+    info = []
+    pattern = re.compile(r"BUR ?\d")
+    i = 0
+    while i < max_tries:
+        try:
+            driver.get(url)
+            html = BeautifulSoup(driver.page_source, "html.parser")
+            title = html.find("h1").contents[0]
+            break
+        except AttributeError:
+            print(f"Error fetching {url}")
+            print("Retrying ...")
+            time.sleep(randrange(5, 15))
+            i += 1
+            continue
+
+    if i == max_tries:
+        print(f"Aborting after {max_tries} tries")
+    else:
+        match = pattern.search(title)
+        if match:
+            kind = match.group(0).replace(" ", "")
+        else:
+            kind = None
+
+        h2 = html.find("h2", text="Versions")
+        if h2:
+            div = h2.findNext("div")
+            links = div.findAll("a")
+            try:
+                country = (
+                    html.find("h2", text="Countries").findNext("div").findNext("div").text
+                )
+            except AttributeError:
+                country = (
+                    html.find("h2", text="Corporate Author")
+                    .findNext("div")
+                    .findNext("div")
+                    .text
+                )
+            doctype = (
+                html.find("h2", text="Document Type").findNext("div").findNext("div").text
+            )
+            for link in links:
+                url = link.attrs["href"]
+                if not kind:
+                    match = pattern.search(url.upper())
+                    if match:
+                        kind = match.group(0)
+                    else:
+                        if ("CRF" in doctype) or ("CRF" in title):
+                            kind = "CRF"
+                        elif ("SEF" in doctype) or ("SEF" in title):
+                            kind = "SEF"
+                        elif ("NIR" in doctype) or ("NIR" in title):
+                            kind = "NIR"
+                        elif "NC" in title:
+                            kind = "NC"
+                        elif "Status report" in title:
+                            kind = "CRF"
+                        else:
+                            kind = "other"
+                info.append({
+                    "Kind": kind,
+                    "Country": country,
+                    "Title": title,
+                    "URL": url,
+                })
+
+            print("\t".join([kind, country, title, url]))
+        else:
+            print(f"No files found for {url}")
+
+    return info
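
A short usage sketch of the new helper, as the fetch scripts call it; the document URL is the placeholder format cited in fetch_submissions_annexI.py:

from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from unfccc_submission_info import get_unfccc_submission_info

options = Options()
options.add_argument('-headless')
driver = Firefox(options=options)

info = get_unfccc_submission_info("https://unfccc.int/documents/65587",
                                  driver, max_tries=10)
for entry in info:
    print(entry["Kind"], entry["Country"], entry["Title"], entry["URL"])
driver.close()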

+ 40 - 2
dodo.py

@@ -54,7 +54,7 @@ def task_download_bur():
         # before download
         'actions': ['datalad run -m "Download BUR submissions" '
                     '-i downloaded_data/UNFCCC/submissions-bur.csv '
-                    './venv/bin/python code/UNFCCC_downloader/download_bur.py'],
+                    './venv/bin/python code/UNFCCC_downloader/download_non-annexI.py --category=BUR'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -72,6 +72,8 @@ def task_update_nc():
     }
 
 
+
+
 def task_download_nc():
     """ Download NC submissions """
     return {
@@ -80,7 +82,42 @@ def task_download_nc():
         # before download
         'actions': ['datalad run -m "Download NC submissions" '
                     '-i downloaded_data/UNFCCC/submissions-nc.csv '
-                    './venv/bin/python code/UNFCCC_downloader/download_nc.py'],
+                    './venv/bin/python code/UNFCCC_downloader/download_non-annexI.py --category=NC'],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
+# annexI data: one update call for all data types (as they are on one page)
+# but for each year separately.
+# downloading is per year and data type (category)
+update_aI_config = {
+    "year": get_var('year', None),
+    "category": get_var('category', None),
+}
+
+
+def task_update_annexi():
+    """ Update list of annexI submissions """
+    return {
+        'targets': [f"downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv"],
+        'actions': [f"datalad run -m 'Fetch AnnexI submissions for {update_aI_config['year']}'"
+                    f"-o downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
+                    f"./venv/bin/python code/UNFCCC_downloader/fetch_submissions_annexI.py "
+                    f"--year={update_aI_config['year']}"],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
+
+def task_download_annexi():
+    """ Download AnnexI submissions """
+    return {
+        #'file_dep': [f"downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv"],
+        # deactivate file_dep for now as it will always run fetch submissions
+        # before download
+        'actions': [f"datalad run -m 'Download AnnexI submissions for "
+                    f"{update_aI_config['category']}{update_aI_config['year']}' "
+                    f"-i downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
+                    f"./venv/bin/python code/UNFCCC_downloader/download_annexI.py "
+                    f"--category={update_aI_config['category']} --year={update_aI_config['year']}"],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -103,6 +140,7 @@ read_config = {
     "submission": get_var('submission', None),
 }
 
+
 def task_read_unfccc_submission():
     """ Read submission for a country (if code exists) """
     return {
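
Note that the new annexI tasks read year and category through doit's get_var, so they are invoked with key=value pairs on the command line, e.g. doit update_annexi year=2022 or doit download_annexi category=CRF year=2022.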