+import pandas as pd
+import requests
+import shutil
+import time
+import os
+from datetime import date
+from selenium import webdriver
+from random import randrange
+from pathlib import Path
+root = Path(__file__).parents[2]
+based on download_bur from national-inventory-submissions
+# (
+# download directly via selenium see link below
+# how-to-download-a-file-using-seleniums-webdriver
+submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" / "submissions-bur.csv")
+# use the CRF data url, for some reason visnting the BUR url
+# is not enough to generate the necessary cookies
+url = ""
+# if we get files of this size they are error pages and we need to
+# try the download again
+error_file_sizes = [212, 210]
+# find which BUR submission rounds exist
+present_BURs = submissions.Kind.unique()
+# Ensure download path and subfolders exist
+download_path = root / "downloaded_data/UNFCCC"
+if not download_path.exists():
+    download_path.mkdir(parents=True)
+for BUR in present_BURs:
+    download_path_BUR = download / BUR
+    if not download_path_BUR.exists():
+        download_path_BUR.mkdir(parents=True)
+# set options for headless mode
+options = webdriver.firefox.options.Options()
+# options.add_argument('-headless')
+# create profile for headless mode 
+profile = webdriver.FirefoxProfile()
+profile.set_preference('', 2)
+# set up selenium driver
+driver = webdriver.Firefox(options=options, firefox_profile=profile)
+# visit the main data page once to create cookies
+# get the session id cookie
+cookies_selenium = driver.get_cookies()
+cookies = {}
+for cookie in cookies_selenium:
+    cookies[cookie['name']] = cookie['value']
+new_downloaded = []
+for idx, submission in submissions.iterrows():
+    print("=" * 60)
+    bur = submission.Kind
+    title = submission.Title
+    url = submission.URL
+    country = submission.Country
+    country = country.replace(' ', '_')
+    print(title)
+    local_filename = download_path / country / bur / url.split('/')[-1]
+    local_filename_underscore = \
+        download_path / country / bur / \
+        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
+    if not local_filename.parent.exists():
+        local_filename.parent.mkdir()
+    ### remove, not needed as no legacy data present
+    #if local_filename.exists():
+    #    # rename
+    #    local_filename.rename(local_filename_underscore)
+    #    print("Renamed " + bur + "/" + country + "/" +
+    # this should never be needed but in case anything goes wrong and
+    # an error page is present it should be overwritten
+    if local_filename_underscore.exists():
+        # check file size. if 210 or 212 bytes it's the error page
+        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+            # found the error page. delete file
+            os.remove(local_filename_underscore)
+    # now we have remove error pages, so a present file should not be overwritten
+    if not local_filename_underscore.exists():
+        i = 0  # reset counter
+        while not local_filename_underscore.exists() and i < 10:
+            # for i = 0 and i = 5 try to get a new session ID
+            if i == 1 or i == 5:
+                driver = webdriver.Firefox(options=options, firefox_profile=profile)
+                # visit the main data page once to create cookies
+                driver.get(url)
+                time.sleep(20)
+                # get the session id cookie
+                cookies_selenium = driver.get_cookies()
+                cookies = {}
+                for cookie in cookies_selenium:
+                    cookies[cookie['name']] = cookie['value']
+            r = requests.get(url, stream=True, cookies = cookies)
+            with open(str(local_filename_underscore), 'wb') as f:
+                shutil.copyfileobj(r.raw, f)
+            # check file size. if 210 or 212 bytes it's the error page
+            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+                # found the error page. delete file
+                os.remove(local_filename_underscore)
+            # sleep a bit to avoid running into captchas
+            time.sleep(randrange(5, 15))
+        if local_filename_underscore.exists():
+            new_downloaded.append(submission)
+            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
+                  "/" +
+        else:
+            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
+                  + bur + "/" +
+    else:
+        print("=> Already downloaded " +
+df = pd.DataFrame(new_downloaded)
+df.to_csv(download_path / "00_new_downloads-{}.csv".format(, index=False)

+#import requests
+import time
+import pandas as pd
+import re
+from pathlib import Path
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from random import randrange
+root = Path(__file__).parents[2]
+Download UNFCCC Biennial Update Report submissions
+from Non-Annex I Parties and create list of submissions as CSV file
+Based on `process_bur` from national-inventory-submissions 
+# TODO for NC
+## link is just /documents/XXXXX (but already dealt with in code below)
+## url is
+## pattern needs NC instead of BUR
+print("Fetching BUR submissions ...")
+url = ""
+# set options for headless mode
+options = webdriver.firefox.options.Options()
+# create profile for headless mode and automatic downloading
+profile = webdriver.FirefoxProfile()
+# set up selenium driver
+driver = webdriver.Firefox(options=options, firefox_profile=profile)
+html = BeautifulSoup(driver.page_source, "html.parser")
+table = html.find_all("table")[1]
+links = table.findAll("a")
+targets = []  # sub-pages
+downloads = []
+no_downloads = []
+# Check links for Zipfiles or subpages
+for link in links:
+    if "href" not in link.attrs:
+        continue
+    href = link.attrs["href"]
+    if "/documents/" in href:
+        if "title" in link.attrs.keys():
+            title = link.attrs["title"]
+        else:
+            title = link.contents[0]
+        if href.startswith("/documents"):
+            href = "" + href
+        # Only add pages in the format
+        # to further downloads
+        if str(Path(href).parent).endswith("documents"):
+            targets.append({"title": title, "url": href})
+pattern = re.compile(r"BUR ?\d")  # "BUR", an optional blank, then one digit
+# Go through sub-pages: load each one and record every link found under its "Versions" heading.
+for target in targets:
+    time.sleep(randrange(5, 15))  # random pause of 5-14 seconds before each page load
+    url = target["url"]
+    #subpage = requests.get(url, timeout=15.5)
+    driver.get(url)
+    html = BeautifulSoup(driver.page_source, "html.parser")
+    title = html.find("h1").contents[0]  # first child of the first <h1> on the page
+    match =  # NOTE(review): right-hand side is missing in this view; presumably a `pattern` match on the title - confirm
+    if match:
+        kind =" ", "")  # NOTE(review): expression is incomplete in this view
+    else:
+        kind = None  # undecided for now; the per-link fallback below tries again
+    h2 = html.find("h2", text="Versions")
+    if h2:
+        div = h2.findNext("div")
+        links = div.findAll("a")  # rebinds the module-level name `links`
+        try:
+            country = (  # text of the second <div> after the "Countries" heading
+                html.find("h2", text="Countries").findNext("div").findNext("div").text
+            )
+        except AttributeError:  # a lookup in the chain above returned None, e.g. no "Countries" heading
+            country = (  # fall back to the "Corporate Author" heading
+                html.find("h2", text="Corporate Author")
+                .findNext("div")
+                .findNext("div")
+                .text
+            )
+        doctype = (  # text of the second <div> after the "Document Type" heading
+            html.find("h2", text="Document Type").findNext("div").findNext("div").text
+        )
+        for link in links:
+            url = link.attrs["href"]
+            if not kind:  # only entered while no kind has been determined yet
+                match =  # NOTE(review): right-hand side is missing in this view
+                if match:
+                    kind =  # NOTE(review): right-hand side is missing in this view
+                else:
+                    if ("NIR" in doctype) or ("NIR" in title):
+                        kind = "NIR"
+                    elif "NC" in title:
+                        kind = "NC"
+            downloads.append(  # one record per link of the "Versions" section
+                {
+                    "Kind": kind,
+                    "Country": country,
+                    "Title": title,
+                    "URL": url,
+                }
+            )
+        print("\t".join([kind, country, title, url]))  # NOTE(review): join() raises TypeError if kind is still None here
+    else:
+        no_downloads.append((title, url))  # page has no "Versions" heading
+if len(no_downloads) > 0:
+    print("No downloads for ", no_downloads)
+df = pd.DataFrame(downloads)
+df = df[["Kind", "Country", "Title", "URL"]]
+df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-bur.csv", index=False)

+#import requests
+import time
+import pandas as pd
+import re
+from pathlib import Path
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from random import randrange
+root = Path(__file__).parents[2]
+Download UNFCCC Biennial Update Report submissions
+from Non-Annex I Parties and create list of submissions as CSV file
+Based on `process_bur` from national-inventory-submissions 
+# TODO for NC
+## link is just /documents/XXXXX (but already dealt with in code below)
+## url is
+## pattern needs NC instead of BUR
+print("Fetching NC submissions ...")
+url = ""
+# set options for headless mode
+options = webdriver.firefox.options.Options()
+# create profile for headless mode and automatic downloading
+profile = webdriver.FirefoxProfile()
+# set up selenium driver
+driver = webdriver.Firefox(options=options, firefox_profile=profile)
+html = BeautifulSoup(driver.page_source, "html.parser")
+table = html.find_all("table")[1]
+links = table.findAll("a")
+targets = []  # sub-pages
+downloads = []
+no_downloads = []
+# Check links for Zipfiles or subpages
+for link in links:
+    if "href" not in link.attrs:
+        continue
+    href = link.attrs["href"]
+    if "/documents/" in href:
+        if "title" in link.attrs.keys():
+            title = link.attrs["title"]
+        else:
+            title = link.contents[0]
+        if href.startswith("/documents"):
+            href = "" + href
+        # Only add pages in the format
+        # to further downloads
+        if str(Path(href).parent).endswith("documents"):
+            targets.append({"title": title, "url": href})
+pattern = re.compile(r"NC ?\d")  # "NC", an optional blank, then one digit
+#skip = True
+# Go through sub-pages: load each one and record every link found under its "Versions" heading.
+for target in targets:
+    #if target["url"] == "":
+    #    skip = False
+    #if skip:
+    #    print(f"Skipping { target['title']}")
+    #    continue
+    time.sleep(randrange(5, 15))  # random pause of 5-14 seconds before each page load
+    url = target["url"]
+    #subpage = requests.get(url, timeout=15.5)
+    driver.get(url)
+    html = BeautifulSoup(driver.page_source, "html.parser")
+    title = html.find("h1").contents[0]  # first child of the first <h1> on the page
+    match =  # NOTE(review): right-hand side is missing in this view; presumably a `pattern` match on the title - confirm
+    if match:
+        kind =" ", "")  # NOTE(review): expression is incomplete in this view
+    else:
+        kind = None  # undecided for now; the per-link fallback below tries again
+    h2 = html.find("h2", text="Versions")
+    if h2:
+        div = h2.findNext("div")
+        links = div.findAll("a")  # rebinds the module-level name `links`
+        try:
+            country = (  # text of the second <div> after the "Countries" heading
+                html.find("h2", text="Countries").findNext("div").findNext("div").text
+            )
+        except AttributeError:  # a lookup in the chain above returned None, e.g. no "Countries" heading
+            country = (  # fall back to the "Corporate Author" heading
+                html.find("h2", text="Corporate Author")
+                .findNext("div")
+                .findNext("div")
+                .text
+            )
+        doctype = (  # text of the second <div> after the "Document Type" heading
+            html.find("h2", text="Document Type").findNext("div").findNext("div").text
+        )
+        for link in links:
+            url = link.attrs["href"]
+            if not kind:  # only entered while no kind has been determined yet
+                match =  # NOTE(review): right-hand side is missing in this view
+                if match:
+                    kind =  # NOTE(review): right-hand side is missing in this view
+                else:
+                    if ("NIR" in doctype) or ("NIR" in title):
+                        kind = "NIR"
+                    elif ("INV" in title) or ("Inventory" in title):
+                        kind = "INV"
+                    else:
+                        print("found unknown record" + url)  # kind stays None for this link
+            downloads.append(  # one record per link of the "Versions" section
+                {
+                    "Kind": kind,
+                    "Country": country,
+                    "Title": title,
+                    "URL": url,
+                }
+            )
+        print("\t".join([kind, country, title, url]))  # NOTE(review): join() raises TypeError if kind is still None here
+    else:
+        no_downloads.append((title, url))  # page has no "Versions" heading
+if len(no_downloads) > 0:
+    print("No downloads for ", no_downloads)
+df = pd.DataFrame(downloads)
+df = df[["Kind", "Country", "Title", "URL"]]
+df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv", index=False)

