{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Investigating Internet Shutdowns with Google Traffic stats\n",
    "\n",
    "Requirements:\n",
    "* numpy\n",
    "* pandas\n",
    "* requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "from datetime import datetime, timedelta\n",
    "import plotly.plotly as py\n",
    "import matplotlib.pyplot as plt\n",
    "import cufflinks as cf\n",
    "import statsmodels.api as sm\n",
    "cf.go_offline()\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The code below is needed to retrieve google transparency data.\n",
    "\n",
    "You can skip this section, just be sure to run it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The functions below are a python port of the official gwt library.\n",
    "# see: https://github.com/gwtproject/gwt/blob/master/user/src/com/google/gwt/user/client/rpc/impl/AbstractSerializationStream.java\n",
    "\n",
    "def long_from_base64(value):\n",
    "    pos = 0\n",
    "    long_val = base64_value(value[pos])\n",
    "    pos += 1\n",
    "    while pos < len(value):\n",
    "        long_val <<= 6\n",
    "        long_val |= base64_value(value[pos])\n",
    "        pos += 1\n",
    "    return long_val;\n",
    "\n",
    "def long_to_base64(value):\n",
    "    # Convert to ints early to avoid need for long ops\n",
    "    low = value & 0xffffffff\n",
    "    high = value >> 32\n",
    "    sb = ''\n",
    "    hnz = False # Have Non zero\n",
    "    hnz, s = base64_digit((high >> 28) & 0xf, hnz)\n",
    "    sb += s\n",
    "    hnz, s = base64_digit((high >> 22) & 0x3f, hnz)\n",
    "    sb += s\n",
    "    hnz, s = base64_digit((high >> 16) & 0x3f, hnz)\n",
    "    sb += s\n",
    "    hnz, s = base64_digit((high >> 10) & 0x3f, hnz)\n",
    "    sb += s\n",
    "    hnz, s = base64_digit((high >> 4) & 0x3f, hnz)\n",
    "    sb += s\n",
    "    v = ((high & 0xf) << 2) | ((low >> 30) & 0x3)\n",
    "    hnz, s = base64_digit(v, hnz)\n",
    "    sb += s\n",
    "    hnz, s = base64_digit((low >> 24) & 0x3f, hnz)\n",
    "    sb += s\n",
    "    hnz, s = base64_digit((low >> 18) & 0x3f, hnz)\n",
    "    sb += s\n",
    "    hnz, s = base64_digit((low >> 12) & 0x3f, hnz)\n",
    "    sb += s\n",
    "    hnz, s = base64_digit((low >> 6) & 0x3f, hnz)\n",
    "    sb += s\n",
    "    hnz, s = base64_digit(low & 0x3f, hnz)\n",
    "    sb += s\n",
    "    return sb\n",
    "    \n",
    "def base64_digit(digit, have_non_zero):\n",
    "    if digit > 0:\n",
    "        have_non_zero = True\n",
    "    if have_non_zero:\n",
    "        if (digit < 26):\n",
    "            return (have_non_zero, chr(ord('A') + digit))\n",
    "        elif (digit < 52):\n",
    "            return (have_non_zero, chr(ord('a') + digit - 26))\n",
    "        elif (digit < 62):\n",
    "            return (have_non_zero, chr(ord('0') + digit - 52))\n",
    "        elif (digit == 62):\n",
    "            return (have_non_zero, '$')\n",
    "        else:\n",
    "            return (have_non_zero, '_')\n",
    "    return (have_non_zero, '')\n",
    "\n",
    "def base64_value(char):\n",
    "    # Assume digit is one of [A-Za-z0-9$_]\n",
    "    if (char >= 'A' and char <= 'Z'):\n",
    "        return ord(char) - ord('A')\n",
    "    # No need to check digit <= 'z'\n",
    "    if (char >= 'a'):\n",
    "        return ord(char) - ord('a') + 26\n",
    "    if (char >= '0' and char <= '9'):\n",
    "        return ord(char) - ord('0') + 52\n",
    "    if (char == '$'):\n",
    "        return 62\n",
    "    # digit == '_'\n",
    "    return 63\n",
    "assert long_to_base64(long_from_base64('U3Ay4uu')) == 'U3Ay4uu'\n",
    "\n",
    "def dt_to_gwt_b64(dt):\n",
    "    \"\"\"\n",
    "    Convert a date to gwt base64 format (string)\n",
    "    \"\"\"\n",
    "    epoch = datetime.utcfromtimestamp(0)\n",
    "    ms_since_epoch = int((dt - epoch).total_seconds()*1000)\n",
    "    return long_to_base64(ms_since_epoch)\n",
    "def gwt_b64_to_dt(s):\n",
    "    \"\"\"\n",
    "    Takes a gwt base64 format and makes it into a date\n",
    "    \"\"\"\n",
    "    ms_since_epoch = long_from_base64(s)\n",
    "    return datetime.utcfromtimestamp(ms_since_epoch/1000)\n",
    "\n",
    "def parse_time_series(response):\n",
    "    \"\"\"\n",
    "    This is an attempt at reversing their format.\n",
    "    Not all works and sometimes it breaks unexpectedly.\n",
    "    \"\"\"\n",
    "    series = {\n",
    "        'start_time': gwt_b64_to_dt(response[2]),\n",
    "        'end_time': gwt_b64_to_dt(response[0])\n",
    "    }\n",
    "    timestamps = []\n",
    "    values = []\n",
    "    count = None\n",
    "    idx = 5\n",
    "    while True:\n",
    "        value = response[idx]\n",
    "        dtype = response[idx+1]\n",
    "        if dtype == 13:\n",
    "            values.append(value)\n",
    "        elif dtype == 8:\n",
    "            count = value\n",
    "            idx += 2\n",
    "            break\n",
    "        idx += 2\n",
    "    offset = 0\n",
    "    while True:\n",
    "        #print response[offset+idx:offset+idx+10]\n",
    "        value = response[offset+idx]\n",
    "        dtype = response[offset+idx+1]\n",
    "        if value == -3:\n",
    "            # XXX this appears to happen when a value is missing\n",
    "            if len(timestamps) > 2:\n",
    "                timestamps.append(timestamps[-1] + (timestamps[-2] - timestamps[-1]))\n",
    "            offset += 1\n",
    "            continue\n",
    "        if dtype != 3:\n",
    "            break\n",
    "        #assert dtype == 3, \"dtype %s != 3\" % dtype\n",
    "        dt = gwt_b64_to_dt(value)\n",
    "        if dt < series['start_time']:\n",
    "            break\n",
    "        #print(dt)\n",
    "        timestamps.append(dt)\n",
    "        offset += 2\n",
    "\n",
    "    series['count'] = count\n",
    "    series['timestamps'] = timestamps\n",
    "    series['values'] = values[:len(timestamps)]\n",
    "    return series\n",
    "\n",
    "def get_time_ranges(start_time, end_time, days=7):\n",
    "    if end_time - start_time <= timedelta(days=days):\n",
    "        return [\n",
    "            (start_time, end_time)\n",
    "        ]\n",
    "    time_ranges = []\n",
    "    range_start = start_time\n",
    "    range_end = range_start + timedelta(days=days)\n",
    "    while range_end <= end_time:\n",
    "        time_ranges.append((range_start, range_end))\n",
    "        range_start = range_end\n",
    "        range_end += timedelta(days=days)\n",
    "    if range_end > end_time:\n",
    "        range_end = end_time\n",
    "    time_ranges.append((range_start, range_end))\n",
    "    return time_ranges\n",
    "\n",
    "GOOGLE_ALPHA_2_CODES = {'BD': 20, 'BE': 21, 'BF': 22, 'BG': 23, 'BA': 18, 'BB': 19, 'BM': 28, 'BN': 29, \n",
    "                        'BO': 30, 'BH': 24, 'BI': 25, 'BJ': 26, 'BT': 34, 'JM': 123, 'BW': 37, 'WS': 262, \n",
    "                        'BR': 32, 'BS': 33, 'JE': 122, 'BY': 38, 'BZ': 39, 'RU': 205, 'RW': 206, 'RS': 204, \n",
    "                        'TL': 237, 'RE': 202, 'TM': 238, 'TJ': 235, 'RO': 203, 'GU': 102, 'GT': 101, 'GR': 99, \n",
    "                        'GQ': 98, 'GP': 97, 'JP': 125, 'GY': 104, 'GG': 91, 'GF': 90, 'GE': 89, 'GD': 88, 'GB': 87, \n",
    "                        'GA': 86, 'SV': 225, 'GN': 96, 'GM': 95, 'GL': 94, 'GI': 93, 'GH': 92, 'OM': 184, 'TN': 239, \n",
    "                        'JO': 124, 'HR': 108, 'HT': 109, 'HU': 110, 'HK': 105, 'HN': 107, 'VE': 256, 'PR': 194, \n",
    "                        'PS': 195, 'PW': 197, 'PT': 196, 'PY': 198, 'IQ': 118, 'PA': 185, 'PF': 187, 'PG': 188, \n",
    "                        'PE': 186, 'PK': 190, 'PH': 189, 'PL': 191, 'PM': 192, 'ZM': 272, 'EE': 71, 'EG': 72, \n",
    "                        'ZA': 271, 'EC': 70, 'IT': 121, 'VN': 259, 'SB': 208, 'ET': 76, 'ZW': 274, 'SA': 207, \n",
    "                        'ES': 75, 'ME': 151, 'MD': 150, 'MG': 153, 'MA': 148, 'MC': 149, 'UZ': 253, 'MM': 157, \n",
    "                        'ML': 156, 'MO': 159, 'MN': 158, 'MH': 154, 'MK': 155, 'MU': 165, 'MT': 164, 'MW': 167, \n",
    "                        'MV': 166, 'MQ': 161, 'MP': 160, 'MS': 163, 'MR': 162, 'IM': 115, 'UG': 248, 'TZ': 246, \n",
    "                        'MY': 169, 'MX': 168, 'IL': 114, 'FR': 84, 'FI': 79, 'FJ': 80, 'FO': 83, 'NI': 176, \n",
    "                        'NL': 177, 'NO': 178, 'SO': 220, 'VU': 260, 'NC': 172, 'NE': 173, 'NF': 174, 'NG': 175, \n",
    "                        'NZ': 183, 'NP': 179, 'NR': 180, 'CK': 47, 'CI': 46, 'CH': 45, 'CO': 51, 'CN': 50, 'CM': 49, \n",
    "                        'CL': 48, 'CA': 40, 'CG': 44, 'CF': 43, 'CD': 42, 'CZ': 60, 'CY': 59, 'CR': 53, 'CV': 56, \n",
    "                        'CU': 55, 'SZ': 228, 'SY': 227, 'KG': 127, 'KE': 126, 'SR': 221, 'KI': 129, 'KH': 128, \n",
    "                        'KN': 131, 'ST': 223, 'SK': 216, 'KR': 133, 'SI': 214, 'SH': 213, 'KW': 134, 'SN': 219, \n",
    "                        'SL': 217, 'SC': 209, 'KZ': 136, 'KY': 135, 'SG': 212, 'SE': 211, 'SD': 210, 'DO': 67, \n",
    "                        'DM': 66, 'DJ': 64, 'DK': 65, 'VG': 257, 'DE': 62, 'YE': 268, 'DZ': 68, 'US': 251, 'UY': 252, \n",
    "                        'YT': 269, 'LB': 138, 'LC': 139, 'LA': 137, 'TV': 244, 'TW': 245, 'TT': 243, 'TR': 242, \n",
    "                        'LK': 141, 'LI': 140, 'LV': 146, 'TO': 240, 'LT': 144, 'LU': 145, 'LR': 142, 'LS': 143, \n",
    "                        'TH': 234, 'TG': 233, 'TD': 231, 'TC': 230, 'LY': 147, 'VC': 255, 'AE': 2, 'AD': 1, 'AG': 4, \n",
    "                        'AF': 3, 'AI': 5, 'VI': 258, 'IS': 120, 'IR': 119, 'AM': 7, 'AL': 6, 'AO': 9, 'AR': 11, \n",
    "                        'AU': 14, 'AT': 13, 'AW': 15, 'IN': 116, 'AX': 16, 'AZ': 17, 'IE': 113, 'ID': 112, 'UA': 247, \n",
    "                        'QA': 199, 'MZ': 170}\n",
    "def get_code_from_alpha2(alpha_2):\n",
    "    try:\n",
    "        return GOOGLE_ALPHA_2_CODES[alpha_2]\n",
    "    except:\n",
    "        raise RuntimeError(\"Country not supported\")\n",
    "\n",
    "def get_metrics_with_date(prod_code, region_code, start_time, end_time):\n",
    "    print(\"Getting metrics for %s - %s\" % (start_time, end_time))\n",
    "    headers = {\n",
    "        'x-client-data': 'CJG2yQEIprbJAQjBtskBCPucygEIqZ3KAQ==',\n",
    "        'x-gwt-module-base': 'https://www.google.com/transparencyreport/gwt/',\n",
    "        'x-gwt-permutation':'DFD0EBA544B633919D593657A1CFAC69',\n",
    "        'content-type': 'text/x-gwt-rpc; charset=UTF-8',\n",
    "        'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6'\n",
    "    }\n",
    "    data = ('7|0|11|https://www.google.com/transparencyreport/gwt/|A95F82F4A46F68F8F3518C8811783D00|'\n",
    "            'com.google.analysis.gblocked.traffic.frontend.shared.TrafficService|evaluateQuery|'\n",
    "            'com.google.analysis.gblocked.traffic.frontend.shared.TrafficRequest/1877719668|'\n",
    "            'com.google.analysis.gblocked.traffic.common.TimeSeries/3457781141|'\n",
    "            'com.google.analysis.gblocked.traffic.common.Logsource/3745169662|'\n",
    "            'com.google.i18n.identifiers.RegionCode/1527795405|'\n",
    "            'com.google.analysis.gblocked.traffic.frontend.shared.Zoom/2138134534|'\n",
    "            'com.google.analysis.gblocked.traffic.frontend.shared.Zoom$Source/3263501257|'\n",
    "            'java.util.Date/3385151746|'\n",
    "     '1|2|3|4|1|5|5|598|4|6|7|{prod_code}|8|{region_code}|9|10|1|11|{end_time}|11|{start_time}|'.format(\n",
    "        prod_code=prod_code,\n",
    "        region_code=region_code,\n",
    "        start_time=dt_to_gwt_b64(start_time),\n",
    "        end_time=dt_to_gwt_b64(end_time)\n",
    "    ))\n",
    "    r = requests.post('https://www.google.com/transparencyreport/gwt/trafficService', headers=headers, data=data)\n",
    "    return r\n",
    "def parse_response(data):\n",
    "    \"\"\"\n",
    "    Do our best to parse the response\n",
    "    \"\"\"\n",
    "    return json.loads(data[4:].replace(\"'\", '\"').replace('\\\\x', '\\\\u00'))\n",
    "def get_data_for_product(region_code, prod_code, start_time, end_time):\n",
    "    timestamps = []\n",
    "    values = []\n",
    "    for st, et in get_time_ranges(start_time, end_time):\n",
    "        r = get_metrics_with_date(prod_code, region_code, st, et)\n",
    "        response = parse_response(r.text)\n",
    "        ts = parse_time_series(response)\n",
    "        for idx, timestamp in enumerate(ts['timestamps']):\n",
    "            if timestamp < start_time or timestamp > end_time:\n",
    "                continue\n",
    "            timestamps.append(timestamp)\n",
    "            values.append(ts['values'][idx])\n",
    "    return timestamps, values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below are the functions you should actually be calling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "GOOGLE_PRODUCT_CODES = {\n",
    "    'Blogger': 1,\n",
    "    'Gmail': 2,\n",
    "    'Google Books': 4,\n",
    "    'Google Docs': 5,\n",
    "    'Google Earth': 6,\n",
    "    'Google Groups': 7,\n",
    "    'Google Images': 8,\n",
    "    'Google Maps': 9,\n",
    "    'Google Search': 12,\n",
    "    'Google Sites': 13,\n",
    "    'Google Spreadsheets': 14,\n",
    "    'Google Translate': 16,\n",
    "    'Google Videos': 17,\n",
    "    'Orkut': 18,\n",
    "    'Picasa Web Albums': 19,\n",
    "    'Traffic Graph': 0,\n",
    "    'YouTube': 21\n",
    "}\n",
    "def get_df_traffic_for_country(alpha_2, start_time, end_time=datetime.utcnow()):\n",
    "    \"\"\"\n",
    "    This function takes as input a country code as alpha2 and returns a pandas\n",
    "    dataframe with all the traffic data for it.\n",
    "    \"\"\"\n",
    "    result = {\n",
    "        'timestamps': None\n",
    "    }\n",
    "    region_code = get_code_from_alpha2(alpha_2)\n",
    "    for prod_name, prod_code in GOOGLE_PRODUCT_CODES.items():\n",
    "        if prod_code not in [2, 21, 9, 8, 12]:\n",
    "            # We only care about: Gmail, Youtube, Maps, Images, Search\n",
    "            continue\n",
    "        print(\"Getting %s - %s\" % (prod_name, alpha_2))\n",
    "        try:\n",
    "            timestamps, values = get_data_for_product(region_code, prod_code, start_time, end_time)\n",
    "            if result['timestamps']:\n",
    "                assert result['timestamps'] == timestamps\n",
    "            else:\n",
    "                result['timestamps'] = timestamps\n",
    "        except Exception as exc:\n",
    "            print(\"MISSING DATA\")\n",
    "            print(exc)\n",
    "            continue\n",
    "        result[prod_name] = values\n",
    "    df = pd.DataFrame(result)\n",
    "    df.drop_duplicates(subset='timestamps', keep='last', inplace=True)\n",
    "    df.set_index('timestamps', inplace=True)\n",
    "    return df.sort_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Finding evidence of shutdown in Ethiopia in 2016"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df_et = get_df_traffic_for_country('ET', datetime(2016, 6, 1, 0, 0), datetime(2016, 8, 30, 0, 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We use as a metric google search, that seems to be the most stationary metric for every country (apart from seasonality variations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "target_metric = 'Google Search'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df = df_et.reset_index()[['Google Search', 'timestamps']]\n",
    "df.columns = ['y', 'ds']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The models require the following extra dependencies:\n",
    "* plotly\n",
    "* cufflinks\n",
    "\n",
    "For the Facebook Prophet based model:\n",
    "* cython\n",
    "* fbprophet (my branch: https://github.com/hellais/prophet/tree/feature/daily-seasonality)\n",
    "\n",
    "For the ARIMA model:\n",
    "* statsmodels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### fbprophet model\n",
    "\n",
    "This first model uses fbprophet an \"Automatic Forecasting Procedure\" developped by facebook. The paper to be used as reference is: https://facebookincubator.github.io/prophet/static/prophet_paper_20170113.pdf.\n",
    "\n",
    "In order to support daily periods you will have to use my branch of it available here: https://github.com/hellais/prophet/tree/feature/daily-seasonality"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from fbprophet import Prophet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pm = Prophet(daily_seasonality=True)\n",
    "pm.fit(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "predictions = pm.predict(df[['ds', 'y']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "predictions.set_index('ds')[['y', 'yhat', 'yhat_lower', 'yhat_upper']].iplot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fbp_model = predictions.set_index('ds')[['y', 'yhat', 'yhat_lower', 'yhat_upper']]\n",
    "fbp_model['anomaly_factor'] = ((fbp_model['yhat'] - fbp_model['y'])/100)\n",
    "layout_fbp = {'shapes': []}\n",
    "anomalies = []\n",
    "anomaly_start = None\n",
    "trigger_factor = 0.4\n",
    "for dt, row in fbp_model.iterrows():\n",
    "    if row.anomaly_factor and row.anomaly_factor > trigger_factor:\n",
    "        if anomaly_start is None:\n",
    "            anomaly_start = dt\n",
    "    elif anomaly_start is not None:\n",
    "        anomalies.append((anomaly_start, dt))\n",
    "        anomaly_start = None\n",
    "for start, end in anomalies:\n",
    "    layout_fbp['shapes'].append({\n",
    "            'type': 'rect',\n",
    "            'xref': 'x',\n",
    "            'yref': 'paper',\n",
    "            'x0': start,\n",
    "            'y0': 0,\n",
    "            'x1': end,\n",
    "            'y1': 1,\n",
    "            'fillcolor': 'red',\n",
    "            'opacity': 0.5,\n",
    "            'line': {\n",
    "                'width': 0\n",
    "            }\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fbp_model.iplot(layout=layout_fbp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ARIMA based model\n",
    "We build a \"Seasonal AutoRegressive Integrated Moving Average with eXogenous regressors model\" and test the actual value to the value of the model.\n",
    "We use as s to `seasonal_order` 48 since the period of the data is 48 (48 ticks per day)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "arimam = sm.tsa.SARIMAX(df.set_index('ds')['y'], order=(1,0,0), seasonal_order=(1,1,0,48)).fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "arima_model = df.set_index('ds')\n",
    "\n",
    "start_time = (arima_model.index[0] + timedelta(days=1)).strftime('%Y-%m-%d')\n",
    "end_time = arima_model.index[-1].strftime('%Y-%m-%d')\n",
    "arima_model['ypred'] = arimam.predict(start_time, end_time, dynamic=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we see an `anomaly_factor` that is greater than 0.4 (positive means it's a downrise), we will color that region red indicating a possible blackout event."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "arima_model['anomaly_factor'] = ((arima_model['ypred'] - arima_model['y'])/100)\n",
    "layout_arima = {'shapes': []}\n",
    "anomalies = []\n",
    "anomaly_start = None\n",
    "trigger_factor = 0.4\n",
    "for dt, row in arima_model.iterrows():\n",
    "    if row.anomaly_factor and row.anomaly_factor > trigger_factor:\n",
    "        if anomaly_start is None:\n",
    "            anomaly_start = dt\n",
    "    elif anomaly_start is not None:\n",
    "        anomalies.append((anomaly_start, dt))\n",
    "        anomaly_start = None\n",
    "for start, end in anomalies:\n",
    "    layout_arima['shapes'].append({\n",
    "            'type': 'rect',\n",
    "            'xref': 'x',\n",
    "            'yref': 'paper',\n",
    "            'x0': start,\n",
    "            'y0': 0,\n",
    "            'x1': end,\n",
    "            'y1': 1,\n",
    "            'fillcolor': 'red',\n",
    "            'opacity': 0.5,\n",
    "            'line': {\n",
    "                'width': 0\n",
    "            }\n",
    "        })"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Plot it!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "arima_model.iplot(layout=layout_arima)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
