{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:16:12.688993Z",
     "iopub.status.busy": "2020-10-15T16:16:12.688809Z",
     "iopub.status.idle": "2020-10-15T16:16:12.691487Z",
     "shell.execute_reply": "2020-10-15T16:16:12.691055Z",
     "shell.execute_reply.started": "2020-10-15T16:16:12.688977Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import xarray as xr\n",
    "import pint_xarray\n",
    "import pathlib\n",
    "from openscm_units import unit_registry as ureg\n",
    "import os\n",
    "import tqdm\n",
    "import zipfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:16:13.035946Z",
     "iopub.status.busy": "2020-10-15T16:16:13.035581Z",
     "iopub.status.idle": "2020-10-15T16:16:13.040967Z",
     "shell.execute_reply": "2020-10-15T16:16:13.040022Z",
     "shell.execute_reply.started": "2020-10-15T16:16:13.035910Z"
    }
   },
   "outputs": [],
   "source": [
    "# locations of the raw input data and of the generated output (relative to the working directory)\n",
    "inpath = pathlib.Path('../inputs/')\n",
    "outpath = pathlib.Path('../outputs/')\n",
    "\n",
    "# per-variable netCDF encoding: zlib compression shrinks the output file to less than 1/3\n",
    "# of its uncompressed size\n",
    "compress_options = {'zlib': True, 'complevel': 1}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:16:13.325976Z",
     "iopub.status.busy": "2020-10-15T16:16:13.325225Z",
     "iopub.status.idle": "2020-10-15T16:16:13.334801Z",
     "shell.execute_reply": "2020-10-15T16:16:13.333160Z",
     "shell.execute_reply.started": "2020-10-15T16:16:13.325906Z"
    }
   },
   "outputs": [],
   "source": [
    "# open the downloaded zip archive read-only; the CSV is read straight out of it below\n",
    "inzip = zipfile.ZipFile(inpath / 'Jeffery-et-al-2018-PRIMAP-crf-2017-v1.zip', mode='r')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:16:13.815747Z",
     "iopub.status.busy": "2020-10-15T16:16:13.814988Z",
     "iopub.status.idle": "2020-10-15T16:16:14.120500Z",
     "shell.execute_reply": "2020-10-15T16:16:14.119710Z",
     "shell.execute_reply.started": "2020-10-15T16:16:13.815673Z"
    }
   },
   "outputs": [],
   "source": [
    "# read via pandas; the context manager closes the archive member once the CSV is parsed\n",
    "csv_name = 'Jeffery-et-al-2018-PRIMAP-crf-2017-v1/Jeffery-et-al-2018-PRIMAP-crf_2017-v1.csv'\n",
    "with inzip.open(csv_name, 'r') as csv_fd:\n",
    "    # skiprows=2: the first two lines of the file are skipped before the header is read\n",
    "    ds = pd.read_csv(csv_fd, skiprows=2).to_xarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:16:26.982561Z",
     "iopub.status.busy": "2020-10-15T16:16:26.981950Z",
     "iopub.status.idle": "2020-10-15T16:16:26.995034Z",
     "shell.execute_reply": "2020-10-15T16:16:26.994022Z",
     "shell.execute_reply.started": "2020-10-15T16:16:26.982505Z"
    }
   },
   "outputs": [],
   "source": [
    "# combine country, category and entity into one MultiIndex on the 'index' dimension;\n",
    "# the MultiIndex is unstacked into regular dimensions further down\n",
    "ds = ds.set_index(index=['country', 'category', 'entity'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:18:07.159983Z",
     "iopub.status.busy": "2020-10-15T16:18:07.159792Z",
     "iopub.status.idle": "2020-10-15T16:18:07.165712Z",
     "shell.execute_reply": "2020-10-15T16:18:07.165271Z",
     "shell.execute_reply.started": "2020-10-15T16:18:07.159967Z"
    }
   },
   "outputs": [],
   "source": [
    "# keep the unit information in its own array before it is removed from the dataset\n",
    "da_units = ds['unit']\n",
    "\n",
    "# 'version' is always the same value and 'unit' now lives in da_units, so drop both\n",
    "for column in ('version', 'unit'):\n",
    "    del ds[column]\n",
    "\n",
    "# stack the remaining variables along a new 'date' axis\n",
    "da = ds.to_array(dim='date')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:18:07.306345Z",
     "iopub.status.busy": "2020-10-15T16:18:07.306027Z",
     "iopub.status.idle": "2020-10-15T16:18:08.843706Z",
     "shell.execute_reply": "2020-10-15T16:18:08.843316Z",
     "shell.execute_reply.started": "2020-10-15T16:18:07.306316Z"
    }
   },
   "outputs": [],
   "source": [
    "# normalize units\n",
    "# NOTE: this cell modifies `da` and `da_units` in place, so run it only once per kernel session\n",
    "\n",
    "# value of the 'gwp conversions' metadata field, keyed by the suffix of the entity name\n",
    "gwp_by_suffix = {\n",
    "    '': 'SARGWP100',\n",
    "    'AR4': 'AR4GWP100',\n",
    "    'AR5': 'AR5GWP100',\n",
    "    'AR5CCF': 'AR5CCFGWP100',\n",
    "}\n",
    "\n",
    "# these entities use CO2 as their unit entity; every name variant maps to its base entity\n",
    "# and to the GWP specification selected by its suffix\n",
    "entity_metadata_map = {}\n",
    "for entity in ('FGASES', 'KYOTOGHG', 'HFCS', 'OTHERHFCS', 'OTHERPFCS', 'PFCS'):\n",
    "    for suffix, gwp in gwp_by_suffix.items():\n",
    "        entity_metadata_map[f'{entity}{suffix}'] = {\n",
    "            'entity': entity,\n",
    "            'unit entity': 'CO2',\n",
    "            'gwp conversions': gwp,\n",
    "        }\n",
    "\n",
    "unit_pretranslation = {\n",
    "    'GgCO2eq': 'Gg',\n",
    "    'MtCO2eq': 'Mt',\n",
    "}\n",
    "\n",
    "preferred_units = {\n",
    "    'CO2': 'Gg CO2 / year'\n",
    "}\n",
    "\n",
    "# will be used later to set the metadata\n",
    "entity_metadata = {}\n",
    "\n",
    "# now convert each entity to a single unit, normalizing the units to scmdata names\n",
    "for entity in np.unique(da['entity']):\n",
    "\n",
    "    # copy the lookup entry: it is modified below and the lookup table must stay intact\n",
    "    metadata = dict(entity_metadata_map.get(entity, {'entity': entity, 'unit entity': entity}))\n",
    "    # 'unit entity' is only needed to build the unit strings, it is not part of the output metadata\n",
    "    unit_entity = metadata.pop('unit entity')\n",
    "\n",
    "    # rows belonging to this entity; this selection stays the same for the whole iteration\n",
    "    is_entity = da['entity'] == entity\n",
    "\n",
    "    # normalize all units to scmdata\n",
    "    # because scmdata contains the entity in the unit, this is kind of complicated\n",
    "    units = np.unique(da_units.loc[{'entity': entity}])\n",
    "\n",
    "    # translate units\n",
    "    for unit in units:\n",
    "        tr_unit = unit_pretranslation.get(unit, unit)\n",
    "        scm_unit = f'{tr_unit} {unit_entity} / year'\n",
    "        da_units.loc[{'index': (da_units == unit) & is_entity}] = scm_unit\n",
    "\n",
    "    # convert all units to a single unit: the preferred one if defined for this unit entity,\n",
    "    # otherwise the alphabetically first unit that occurs in the data\n",
    "    units = sorted(np.unique(da_units.loc[{'entity': entity}]))\n",
    "    normal_unit = preferred_units.get(unit_entity, units[0])\n",
    "\n",
    "    for unit in units:\n",
    "        if unit == normal_unit:\n",
    "            continue\n",
    "        # scale factor from `unit` to `normal_unit`, computed by the unit registry\n",
    "        factor = ureg(unit).to(ureg(normal_unit)).magnitude\n",
    "        loc = {'index': (da_units == unit) & is_entity}\n",
    "        da.loc[loc] *= factor\n",
    "        da_units.loc[loc] = normal_unit\n",
    "\n",
    "    metadata['units'] = normal_unit\n",
    "    entity_metadata[entity] = metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:18:08.844610Z",
     "iopub.status.busy": "2020-10-15T16:18:08.844471Z",
     "iopub.status.idle": "2020-10-15T16:18:10.727622Z",
     "shell.execute_reply": "2020-10-15T16:18:10.726972Z",
     "shell.execute_reply.started": "2020-10-15T16:18:08.844594Z"
    }
   },
   "outputs": [],
   "source": [
    "# expand the MultiIndex into separate dimensions, then drop every date for which all\n",
    "# values are NaN; `how` is passed by keyword because newer xarray releases deprecate\n",
    "# passing it positionally\n",
    "da = da.unstack().dropna('date', how='all')\n",
    "\n",
    "# the date labels are parsed as years (format '%Y') into datetime values\n",
    "da['date'] = pd.to_datetime(da['date'].values, format='%Y')\n",
    "\n",
    "# one data variable per entity\n",
    "ds = da.to_dataset('entity')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:18:10.728568Z",
     "iopub.status.busy": "2020-10-15T16:18:10.728384Z",
     "iopub.status.idle": "2020-10-15T16:18:10.733089Z",
     "shell.execute_reply": "2020-10-15T16:18:10.732491Z",
     "shell.execute_reply.started": "2020-10-15T16:18:10.728544Z"
    }
   },
   "outputs": [],
   "source": [
    "# attach the metadata collected during unit normalization to each entity's variable\n",
    "for entity, entity_da in ds.items():\n",
    "    entity_da.attrs = entity_metadata[entity]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-15T16:18:10.734074Z",
     "iopub.status.busy": "2020-10-15T16:18:10.733864Z",
     "iopub.status.idle": "2020-10-15T16:18:12.170363Z",
     "shell.execute_reply": "2020-10-15T16:18:12.169960Z",
     "shell.execute_reply.started": "2020-10-15T16:18:10.734044Z"
    }
   },
   "outputs": [],
   "source": [
    "# make sure the output directory exists; to_netcdf does not create missing directories\n",
    "outpath.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# apply the same compression settings to every data variable\n",
    "encoding = {variable: compress_options for variable in ds}\n",
    "ds.to_netcdf(outpath / 'primap-crf-2017v1.nc', encoding=encoding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}