gjbex · gjbex · May 27, 2021 · May 26, 2021 · May 27, 2021 · May 27, 2021
diff --git a/python_for_data_science.pptx b/python_for_data_science.pptx
diff --git a/source-code/pandas/README.md b/source-code/pandas/README.md
@@ -23,4 +23,5 @@ easy to use.
 1. `patient_data.ipynb`: extended version of therunninng example used
     in the Python slides.
 1. `bokeh_plot.ipynb`: using Bokeh as a plotting beackdnd for pandas.
+1. `pipes.ipynb`: consolidating data processing using pipes.
 1. `screenshots`: screenshots made for the slides.
diff --git a/source-code/pandas/patient_data.ipynb b/source-code/pandas/patient_data.ipynb
diff --git a/source-code/pandas/pipes.ipynb b/source-code/pandas/pipes.ipynb
@@ -0,0 +1,262 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b8acc759-2d21-4ac1-a64a-e338fa7f516c",
+   "metadata": {},
+   "source": [
+    "# Requirements"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "09782f37-83d3-4670-a6b4-5030c7a0717d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "75061e26-7b02-4955-a623-f085bdd461e0",
+   "metadata": {},
+   "source": [
+    "# Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10c99a56-bf6b-4135-8a89-44a8caa38d63",
+   "metadata": {},
+   "source": [
+    "Read the patient experiment data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "55bc641a-bd06-4a5e-8d58-c8e177e62d89",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_excel('data/patient_experiment.xlsx')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "267a9df3-f049-4ff9-96c8-d4af5d1da64c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 62 entries, 0 to 61\n",
+      "Data columns (total 4 columns):\n",
+      " #   Column       Non-Null Count  Dtype         \n",
+      "---  ------       --------------  -----         \n",
+      " 0   patient      62 non-null     int64         \n",
+      " 1   dose         61 non-null     float64       \n",
+      " 2   date         62 non-null     datetime64[ns]\n",
+      " 3   temperature  61 non-null     float64       \n",
+      "dtypes: datetime64[ns](1), float64(2), int64(1)\n",
+      "memory usage: 2.1 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "data.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "54f7cce5-d811-4def-a99d-53c2c5893126",
+   "metadata": {},
+   "source": [
+    "The first step is transforming the data into a time series."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "81c6819c-412f-4787-8e44-cef39b035ca4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_time_series(df):\n",
+    "    return df.pivot_table(index='date', columns=['patient'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f2a9d1c9-d8a8-4df4-ba2a-0faf2250a11c",
+   "metadata": {},
+   "source": [
+    "Next, we should deal with missing data by interpolation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4c6e3606-cb4f-48a5-808f-1d4d65649ce1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def impute(df):\n",
+    "    return df.interpolate()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6e63b2af-4d79-49e6-9f93-bbd6966a60a6",
+   "metadata": {},
+   "source": [
+    "Finally, we compute the mean value of the temperatures across all patients for each time step.  Note that the name of the column is a parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "1702df3c-65b1-45dd-8280-c87e300ac9a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_mean(df, column):\n",
+    "    df['avg_temp'] = df[column].mean(axis=1)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2fc4796e-fb31-4780-819e-c1680e3608cb",
+   "metadata": {},
+   "source": [
+    "All these operations can be chained using pipes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "6609a625-c8ed-4695-8bd4-74d9cd7af05b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "time_series = data.pipe(create_time_series) \\\n",
+    "                  .pipe(impute) \\\n",
+    "                  .pipe(compute_mean, 'temperature')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "3f6beb98-9d04-4d87-84f1-a7db0bbcd46c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "DatetimeIndex: 7 entries, 2012-10-02 10:00:00 to 2012-10-02 16:00:00\n",
+      "Data columns (total 19 columns):\n",
+      " #   Column            Non-Null Count  Dtype  \n",
+      "---  ------            --------------  -----  \n",
+      " 0   (dose, 1)         7 non-null      float64\n",
+      " 1   (dose, 2)         7 non-null      float64\n",
+      " 2   (dose, 3)         7 non-null      float64\n",
+      " 3   (dose, 4)         7 non-null      float64\n",
+      " 4   (dose, 5)         7 non-null      float64\n",
+      " 5   (dose, 6)         7 non-null      float64\n",
+      " 6   (dose, 7)         7 non-null      float64\n",
+      " 7   (dose, 8)         7 non-null      float64\n",
+      " 8   (dose, 9)         7 non-null      float64\n",
+      " 9   (temperature, 1)  7 non-null      float64\n",
+      " 10  (temperature, 2)  7 non-null      float64\n",
+      " 11  (temperature, 3)  7 non-null      float64\n",
+      " 12  (temperature, 4)  7 non-null      float64\n",
+      " 13  (temperature, 5)  7 non-null      float64\n",
+      " 14  (temperature, 6)  7 non-null      float64\n",
+      " 15  (temperature, 7)  7 non-null      float64\n",
+      " 16  (temperature, 8)  7 non-null      float64\n",
+      " 17  (temperature, 9)  7 non-null      float64\n",
+      " 18  (avg_temp, )      7 non-null      float64\n",
+      "dtypes: float64(19)\n",
+      "memory usage: 1.1 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "time_series.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0dced61d-d65a-4e5b-9698-545d742d384a",
+   "metadata": {},
+   "source": [
+    "The original dataframe is unchanged."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "42e86dc9-5dea-4472-8241-159200c55dac",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 62 entries, 0 to 61\n",
+      "Data columns (total 4 columns):\n",
+      " #   Column       Non-Null Count  Dtype         \n",
+      "---  ------       --------------  -----         \n",
+      " 0   patient      62 non-null     int64         \n",
+      " 1   dose         61 non-null     float64       \n",
+      " 2   date         62 non-null     datetime64[ns]\n",
+      " 3   temperature  61 non-null     float64       \n",
+      "dtypes: datetime64[ns](1), float64(2), int64(1)\n",
+      "memory usage: 2.1 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77ff0a74-c286-412f-8ea9-6df113efd4fe",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}