fetch_submissions_bur.py

#import requests
import time
import pandas as pd
import re
from pathlib import Path
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange
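
# assumed repository root: two directory levels above this script's folder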
root = Path(__file__).parents[2]

"""
Download UNFCCC Biennial Update Report submissions
from Non-Annex I Parties and create a list of submissions as a CSV file.
Based on `process_bur` from national-inventory-submissions
(https://github.com/openclimatedata/national-inventory-submisions)
"""
print("Fetching BUR submissions ...")
url = "https://unfccc.int/BURs"
#print(url)

# set options for headless mode
profile_path = ".firefox"
options = Options()
options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)

# set up selenium driver
driver = Firefox(options=options)
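
# load the BUR overview page and parse the rendered HTML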
driver.get(url)
html = BeautifulSoup(driver.page_source, "html.parser")
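# the list of BUR submissions is expected in the second table on the page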
table = html.find_all("table")[1]
links = table.findAll("a")

targets = []  # sub-pages
downloads = []
no_downloads = []

# Check links for Zipfiles or subpages
for link in links:
    if "href" not in link.attrs:
        continue
    href = link.attrs["href"]
    if "/documents/" in href:
        if "title" in link.attrs.keys():
            title = link.attrs["title"]
        else:
            title = link.contents[0]
        if href.startswith("/documents"):
            href = "https://unfccc.int" + href
        # Only add pages in the format https://unfccc.int/documents/65587
        # to further downloads
        if str(Path(href).parent).endswith("documents"):
            targets.append({"title": title, "url": href})
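
# submission kind, e.g. "BUR1" or "BUR 2", as it appears in titles and URLs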
pattern = re.compile(r"BUR ?\d")

# Go through sub-pages.
for target in targets:
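    # pause a random 5-14 seconds between page loads to avoid hammering the server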
    time.sleep(randrange(5, 15))
    url = target["url"]
    #subpage = requests.get(url, timeout=15.5)
    driver.get(url)
    html = BeautifulSoup(driver.page_source, "html.parser")
    title = html.find("h1").contents[0]
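    # try to infer the submission kind (e.g. BUR1) from the page title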
    match = pattern.search(title)
    if match:
        kind = match.group(0).replace(" ", "")
    else:
        kind = None
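
    # the "Versions" section of a document page holds the download links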
    h2 = html.find("h2", text="Versions")
    if h2:
        div = h2.findNext("div")
        links = div.findAll("a")
        try:
            country = (
                html.find("h2", text="Countries").findNext("div").findNext("div").text
            )
        except AttributeError:
            country = (
                html.find("h2", text="Corporate Author")
                .findNext("div")
                .findNext("div")
                .text
            )
        doctype = (
            html.find("h2", text="Document Type").findNext("div").findNext("div").text
        )
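
        # record one entry per linked file version for this submission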
        for link in links:
            url = link.attrs["href"]
            if not kind:
                match = pattern.search(url.upper())
                if match:
                    kind = match.group(0)
                else:
                    if ("NIR" in doctype) or ("NIR" in title):
                        kind = "NIR"
                    elif "NC" in title:
                        kind = "NC"
            downloads.append(
                {
                    "Kind": kind,
                    "Country": country,
                    "Title": title,
                    "URL": url,
                }
            )
            # kind may still be None here; print an empty field in that case
            print("\t".join([kind or "", country, title, url]))
    else:
        no_downloads.append((title, url))
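
# report document pages without a "Versions" section (nothing to download)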
if len(no_downloads) > 0:
    print("No downloads for ", no_downloads)

driver.close()
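
# store the collected submission list as a CSV file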
df = pd.DataFrame(downloads)
df = df[["Kind", "Country", "Title", "URL"]]
df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-bur.csv", index=False)