Forráskód Böngészése

script to convert the data to xarray

Mika Pflüger 4 éve
szülő
commit
046b1833cf
2 módosított fájl, 594 hozzáadás és 0 törlés
  1. 297 0
      code/.ipynb_checkpoints/read_2017v1-checkpoint.ipynb
  2. 297 0
      code/read_2017v1.ipynb

+ 297 - 0
code/.ipynb_checkpoints/read_2017v1-checkpoint.ipynb

@@ -0,0 +1,297 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:12.688993Z",
+     "iopub.status.busy": "2020-10-15T16:16:12.688809Z",
+     "iopub.status.idle": "2020-10-15T16:16:12.691487Z",
+     "shell.execute_reply": "2020-10-15T16:16:12.691055Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:12.688977Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import xarray as xr\n",
+    "import pint_xarray\n",
+    "import pathlib\n",
+    "from openscm_units import unit_registry as ureg\n",
+    "import os\n",
+    "import tqdm\n",
+    "import zipfile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:13.035946Z",
+     "iopub.status.busy": "2020-10-15T16:16:13.035581Z",
+     "iopub.status.idle": "2020-10-15T16:16:13.040967Z",
+     "shell.execute_reply": "2020-10-15T16:16:13.040022Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:13.035910Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# relative locations of the raw inputs and of the converted outputs\n",
+    "inpath = pathlib.Path('../inputs/')\n",
+    "outpath = pathlib.Path('../outputs/')\n",
+    "\n",
+    "# netCDF encoding applied to every variable: zlib compression at the lowest level;\n",
+    "# compressing the output yields a file size less than 1/3 of an uncompressed file\n",
+    "compress_options = {'zlib': True, 'complevel': 1}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:13.325976Z",
+     "iopub.status.busy": "2020-10-15T16:16:13.325225Z",
+     "iopub.status.idle": "2020-10-15T16:16:13.334801Z",
+     "shell.execute_reply": "2020-10-15T16:16:13.333160Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:13.325906Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# open the input archive in read-only mode\n",
+    "inzip = zipfile.ZipFile(inpath / 'Jeffery-et-al-2018-PRIMAP-crf-2017-v1.zip', mode='r')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:13.815747Z",
+     "iopub.status.busy": "2020-10-15T16:16:13.814988Z",
+     "iopub.status.idle": "2020-10-15T16:16:14.120500Z",
+     "shell.execute_reply": "2020-10-15T16:16:14.119710Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:13.815673Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# read via pandas; the context manager closes the archive member once it is parsed\n",
+    "csv_name = 'Jeffery-et-al-2018-PRIMAP-crf-2017-v1/Jeffery-et-al-2018-PRIMAP-crf_2017-v1.csv'\n",
+    "with inzip.open(csv_name, 'r') as csv_fd:\n",
+    "    # skiprows=2 skips the first two lines of the file, the header is read from the third\n",
+    "    ds = pd.read_csv(csv_fd, skiprows=2).to_xarray()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:26.982561Z",
+     "iopub.status.busy": "2020-10-15T16:16:26.981950Z",
+     "iopub.status.idle": "2020-10-15T16:16:26.995034Z",
+     "shell.execute_reply": "2020-10-15T16:16:26.994022Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:26.982505Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# combine the three identifying columns into a MultiIndex on the 'index' dimension;\n",
+    "# the MultiIndex'ed ds is converted to a normal ds later\n",
+    "ds = ds.set_index(index=['country', 'category', 'entity'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:07.159983Z",
+     "iopub.status.busy": "2020-10-15T16:18:07.159792Z",
+     "iopub.status.idle": "2020-10-15T16:18:07.165712Z",
+     "shell.execute_reply": "2020-10-15T16:18:07.165271Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:07.159967Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# split unit information into its own array; it is copied so that later in-place edits\n",
+    "# of da_units cannot leak back into ds\n",
+    "da_units = ds['unit'].copy()\n",
+    "\n",
+    "# 'version' is always the same, so it is dropped together with 'unit'; the remaining\n",
+    "# variables are stacked along a new 'date' axis. drop_vars returns a new dataset instead\n",
+    "# of deleting in place, so re-running this cell does not raise a KeyError\n",
+    "da = ds.drop_vars(['version', 'unit']).to_array('date')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:07.306345Z",
+     "iopub.status.busy": "2020-10-15T16:18:07.306027Z",
+     "iopub.status.idle": "2020-10-15T16:18:08.843706Z",
+     "shell.execute_reply": "2020-10-15T16:18:08.843316Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:07.306316Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# normalize units\n",
+    "\n",
+    "# GWP conversion implied by the suffix of a basket entity name (no suffix: SAR)\n",
+    "gwp_by_suffix = {\n",
+    "    '': 'SARGWP100',\n",
+    "    'AR4': 'AR4GWP100',\n",
+    "    'AR5': 'AR5GWP100',\n",
+    "    'AR5CCF': 'AR5CCFGWP100',\n",
+    "}\n",
+    "\n",
+    "# metadata of the basket entities, keyed by the entity name including the suffix\n",
+    "entity_metadata_map = {}\n",
+    "for entity in ('FGASES', 'KYOTOGHG', 'HFCS', 'OTHERHFCS', 'OTHERPFCS', 'PFCS'):\n",
+    "    for suffix, gwp in gwp_by_suffix.items():\n",
+    "        entity_metadata_map[f'{entity}{suffix}'] = {\n",
+    "            'entity': entity,\n",
+    "            'unit entity': 'CO2',\n",
+    "            'gwp conversions': gwp,\n",
+    "        }\n",
+    "\n",
+    "# input unit names that are rewritten before the unit entity is appended\n",
+    "unit_pretranslation = {'GgCO2eq': 'Gg', 'MtCO2eq': 'Mt'}\n",
+    "\n",
+    "# target unit per unit entity; without an entry the first unit in sorted order is used\n",
+    "preferred_units = {'CO2': 'Gg CO2 / year'}\n",
+    "\n",
+    "# will be used later to set the metadata\n",
+    "entity_metadata = {}\n",
+    "\n",
+    "# now convert each entity to a single unit, normalizing the units to scmdata names\n",
+    "for entity in np.unique(da['entity']):\n",
+    "    if entity in entity_metadata_map:\n",
+    "        metadata = entity_metadata_map[entity]\n",
+    "    else:\n",
+    "        metadata = {'entity': entity, 'unit entity': entity}\n",
+    "    # 'unit entity' is only needed to build the unit names, it is not kept in the metadata\n",
+    "    unit_entity = metadata.pop('unit entity')\n",
+    "\n",
+    "    # rows which belong to the current entity\n",
+    "    is_entity = da['entity'] == entity\n",
+    "\n",
+    "    # normalize all units to scmdata\n",
+    "    # because scmdata contains the entity in the unit, the entity is appended to each unit\n",
+    "    for raw_unit in np.unique(da_units.loc[{'entity': entity}]):\n",
+    "        prefix = unit_pretranslation.get(raw_unit, raw_unit)\n",
+    "        selection = {'index': (da_units == raw_unit) & is_entity}\n",
+    "        da_units.loc[selection] = f'{prefix} {unit_entity} / year'\n",
+    "\n",
+    "    # convert all units to a single unit\n",
+    "    units = sorted(np.unique(da_units.loc[{'entity': entity}]))\n",
+    "    normal_unit = preferred_units.get(unit_entity, units[0])\n",
+    "\n",
+    "    for unit in units:\n",
+    "        if unit != normal_unit:\n",
+    "            # rescale the values and relabel the rows which are not in the target unit yet\n",
+    "            factor = ureg(unit).to(ureg(normal_unit)).magnitude\n",
+    "            loc = {'index': (da_units == unit) & is_entity}\n",
+    "            da.loc[loc] *= factor\n",
+    "            da_units.loc[loc] = normal_unit\n",
+    "\n",
+    "    metadata['units'] = normal_unit\n",
+    "    entity_metadata[entity] = metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:08.844610Z",
+     "iopub.status.busy": "2020-10-15T16:18:08.844471Z",
+     "iopub.status.idle": "2020-10-15T16:18:10.727622Z",
+     "shell.execute_reply": "2020-10-15T16:18:10.726972Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:08.844594Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# turn the MultiIndex levels into regular dimensions, then drop dates without any value;\n",
+    "# 'how' is passed by keyword because newer xarray versions only accept it as a keyword\n",
+    "da = da.unstack().dropna('date', how='all')\n",
+    "# parse the date labels as years\n",
+    "da['date'] = pd.to_datetime(da['date'].values, format='%Y')\n",
+    "# one data variable per entity\n",
+    "ds = da.to_dataset('entity')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:10.728568Z",
+     "iopub.status.busy": "2020-10-15T16:18:10.728384Z",
+     "iopub.status.idle": "2020-10-15T16:18:10.733089Z",
+     "shell.execute_reply": "2020-10-15T16:18:10.732491Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:10.728544Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# attach the metadata collected per entity to the corresponding data variable\n",
+    "for entity, data_array in ds.data_vars.items():\n",
+    "    data_array.attrs = entity_metadata[entity]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:10.734074Z",
+     "iopub.status.busy": "2020-10-15T16:18:10.733864Z",
+     "iopub.status.idle": "2020-10-15T16:18:12.170363Z",
+     "shell.execute_reply": "2020-10-15T16:18:12.169960Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:10.734044Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# every data variable is written with the same compression settings\n",
+    "encoding = dict.fromkeys(ds, compress_options)\n",
+    "ds.to_netcdf(outpath / 'primap-crf-2017v1.nc', encoding=encoding)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

+ 297 - 0
code/read_2017v1.ipynb

@@ -0,0 +1,297 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:12.688993Z",
+     "iopub.status.busy": "2020-10-15T16:16:12.688809Z",
+     "iopub.status.idle": "2020-10-15T16:16:12.691487Z",
+     "shell.execute_reply": "2020-10-15T16:16:12.691055Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:12.688977Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import xarray as xr\n",
+    "import pint_xarray\n",
+    "import pathlib\n",
+    "from openscm_units import unit_registry as ureg\n",
+    "import os\n",
+    "import tqdm\n",
+    "import zipfile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:13.035946Z",
+     "iopub.status.busy": "2020-10-15T16:16:13.035581Z",
+     "iopub.status.idle": "2020-10-15T16:16:13.040967Z",
+     "shell.execute_reply": "2020-10-15T16:16:13.040022Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:13.035910Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# relative locations of the raw inputs and of the converted outputs\n",
+    "inpath = pathlib.Path('../inputs/')\n",
+    "outpath = pathlib.Path('../outputs/')\n",
+    "\n",
+    "# netCDF encoding applied to every variable: zlib compression at the lowest level;\n",
+    "# compressing the output yields a file size less than 1/3 of an uncompressed file\n",
+    "compress_options = {'zlib': True, 'complevel': 1}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:13.325976Z",
+     "iopub.status.busy": "2020-10-15T16:16:13.325225Z",
+     "iopub.status.idle": "2020-10-15T16:16:13.334801Z",
+     "shell.execute_reply": "2020-10-15T16:16:13.333160Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:13.325906Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# open the input archive in read-only mode\n",
+    "inzip = zipfile.ZipFile(inpath / 'Jeffery-et-al-2018-PRIMAP-crf-2017-v1.zip', mode='r')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:13.815747Z",
+     "iopub.status.busy": "2020-10-15T16:16:13.814988Z",
+     "iopub.status.idle": "2020-10-15T16:16:14.120500Z",
+     "shell.execute_reply": "2020-10-15T16:16:14.119710Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:13.815673Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# read via pandas; the context manager closes the archive member once it is parsed\n",
+    "csv_name = 'Jeffery-et-al-2018-PRIMAP-crf-2017-v1/Jeffery-et-al-2018-PRIMAP-crf_2017-v1.csv'\n",
+    "with inzip.open(csv_name, 'r') as csv_fd:\n",
+    "    # skiprows=2 skips the first two lines of the file, the header is read from the third\n",
+    "    ds = pd.read_csv(csv_fd, skiprows=2).to_xarray()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:16:26.982561Z",
+     "iopub.status.busy": "2020-10-15T16:16:26.981950Z",
+     "iopub.status.idle": "2020-10-15T16:16:26.995034Z",
+     "shell.execute_reply": "2020-10-15T16:16:26.994022Z",
+     "shell.execute_reply.started": "2020-10-15T16:16:26.982505Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# combine the three identifying columns into a MultiIndex on the 'index' dimension;\n",
+    "# the MultiIndex'ed ds is converted to a normal ds later\n",
+    "ds = ds.set_index(index=['country', 'category', 'entity'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:07.159983Z",
+     "iopub.status.busy": "2020-10-15T16:18:07.159792Z",
+     "iopub.status.idle": "2020-10-15T16:18:07.165712Z",
+     "shell.execute_reply": "2020-10-15T16:18:07.165271Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:07.159967Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# split unit information into its own array; it is copied so that later in-place edits\n",
+    "# of da_units cannot leak back into ds\n",
+    "da_units = ds['unit'].copy()\n",
+    "\n",
+    "# 'version' is always the same, so it is dropped together with 'unit'; the remaining\n",
+    "# variables are stacked along a new 'date' axis. drop_vars returns a new dataset instead\n",
+    "# of deleting in place, so re-running this cell does not raise a KeyError\n",
+    "da = ds.drop_vars(['version', 'unit']).to_array('date')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:07.306345Z",
+     "iopub.status.busy": "2020-10-15T16:18:07.306027Z",
+     "iopub.status.idle": "2020-10-15T16:18:08.843706Z",
+     "shell.execute_reply": "2020-10-15T16:18:08.843316Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:07.306316Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# normalize units\n",
+    "\n",
+    "# GWP conversion implied by the suffix of a basket entity name (no suffix: SAR)\n",
+    "gwp_by_suffix = {\n",
+    "    '': 'SARGWP100',\n",
+    "    'AR4': 'AR4GWP100',\n",
+    "    'AR5': 'AR5GWP100',\n",
+    "    'AR5CCF': 'AR5CCFGWP100',\n",
+    "}\n",
+    "\n",
+    "# metadata of the basket entities, keyed by the entity name including the suffix\n",
+    "entity_metadata_map = {}\n",
+    "for entity in ('FGASES', 'KYOTOGHG', 'HFCS', 'OTHERHFCS', 'OTHERPFCS', 'PFCS'):\n",
+    "    for suffix, gwp in gwp_by_suffix.items():\n",
+    "        entity_metadata_map[f'{entity}{suffix}'] = {\n",
+    "            'entity': entity,\n",
+    "            'unit entity': 'CO2',\n",
+    "            'gwp conversions': gwp,\n",
+    "        }\n",
+    "\n",
+    "# input unit names that are rewritten before the unit entity is appended\n",
+    "unit_pretranslation = {'GgCO2eq': 'Gg', 'MtCO2eq': 'Mt'}\n",
+    "\n",
+    "# target unit per unit entity; without an entry the first unit in sorted order is used\n",
+    "preferred_units = {'CO2': 'Gg CO2 / year'}\n",
+    "\n",
+    "# will be used later to set the metadata\n",
+    "entity_metadata = {}\n",
+    "\n",
+    "# now convert each entity to a single unit, normalizing the units to scmdata names\n",
+    "for entity in np.unique(da['entity']):\n",
+    "    if entity in entity_metadata_map:\n",
+    "        metadata = entity_metadata_map[entity]\n",
+    "    else:\n",
+    "        metadata = {'entity': entity, 'unit entity': entity}\n",
+    "    # 'unit entity' is only needed to build the unit names, it is not kept in the metadata\n",
+    "    unit_entity = metadata.pop('unit entity')\n",
+    "\n",
+    "    # rows which belong to the current entity\n",
+    "    is_entity = da['entity'] == entity\n",
+    "\n",
+    "    # normalize all units to scmdata\n",
+    "    # because scmdata contains the entity in the unit, the entity is appended to each unit\n",
+    "    for raw_unit in np.unique(da_units.loc[{'entity': entity}]):\n",
+    "        prefix = unit_pretranslation.get(raw_unit, raw_unit)\n",
+    "        selection = {'index': (da_units == raw_unit) & is_entity}\n",
+    "        da_units.loc[selection] = f'{prefix} {unit_entity} / year'\n",
+    "\n",
+    "    # convert all units to a single unit\n",
+    "    units = sorted(np.unique(da_units.loc[{'entity': entity}]))\n",
+    "    normal_unit = preferred_units.get(unit_entity, units[0])\n",
+    "\n",
+    "    for unit in units:\n",
+    "        if unit != normal_unit:\n",
+    "            # rescale the values and relabel the rows which are not in the target unit yet\n",
+    "            factor = ureg(unit).to(ureg(normal_unit)).magnitude\n",
+    "            loc = {'index': (da_units == unit) & is_entity}\n",
+    "            da.loc[loc] *= factor\n",
+    "            da_units.loc[loc] = normal_unit\n",
+    "\n",
+    "    metadata['units'] = normal_unit\n",
+    "    entity_metadata[entity] = metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:08.844610Z",
+     "iopub.status.busy": "2020-10-15T16:18:08.844471Z",
+     "iopub.status.idle": "2020-10-15T16:18:10.727622Z",
+     "shell.execute_reply": "2020-10-15T16:18:10.726972Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:08.844594Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# turn the MultiIndex levels into regular dimensions, then drop dates without any value;\n",
+    "# 'how' is passed by keyword because newer xarray versions only accept it as a keyword\n",
+    "da = da.unstack().dropna('date', how='all')\n",
+    "# parse the date labels as years\n",
+    "da['date'] = pd.to_datetime(da['date'].values, format='%Y')\n",
+    "# one data variable per entity\n",
+    "ds = da.to_dataset('entity')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:10.728568Z",
+     "iopub.status.busy": "2020-10-15T16:18:10.728384Z",
+     "iopub.status.idle": "2020-10-15T16:18:10.733089Z",
+     "shell.execute_reply": "2020-10-15T16:18:10.732491Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:10.728544Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# attach the metadata collected per entity to the corresponding data variable\n",
+    "for entity, data_array in ds.data_vars.items():\n",
+    "    data_array.attrs = entity_metadata[entity]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-10-15T16:18:10.734074Z",
+     "iopub.status.busy": "2020-10-15T16:18:10.733864Z",
+     "iopub.status.idle": "2020-10-15T16:18:12.170363Z",
+     "shell.execute_reply": "2020-10-15T16:18:12.169960Z",
+     "shell.execute_reply.started": "2020-10-15T16:18:10.734044Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# every data variable is written with the same compression settings\n",
+    "encoding = dict.fromkeys(ds, compress_options)\n",
+    "ds.to_netcdf(outpath / 'primap-crf-2017v1.nc', encoding=encoding)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}