{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "prescribed-green",
   "metadata": {},
   "source": [
    "# Pandas (5/5-2021)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "weekly-playlist",
   "metadata": {},
   "source": [
    "## Exercise\n",
    "\n",
    "Load csv file with major cities from https://datahub.io/core/world-cities into a Pandas dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "flexible-walker",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "cities = pd.read_csv('world-cities_csv.csv')\n",
    "print(cities)  # ascii printout"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fantastic-reflection",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities  # nicely integrades with Jupyter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "alike-ability",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities['country']   # single column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "tracked-speaker",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities.country  # single column, shorter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "literary-boost",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities[['name', 'country']]  # multiple columns, note index is a list of columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "second-lotus",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities[5400:5410]  # can slice by row number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adapted-municipality",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities[5409]  # fails :-(, use .loc instead"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "educated-label",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities.loc[10]  # use loc to index by row index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "gothic-graduation",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities.loc[:10, ['name', 'country']]  # slicing and extracting columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "gothic-induction",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities.at[5409, 'name']  # use .at to look entry in table by row/column index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "logical-denver",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities.geonameid * 10  # can do operations on columns, like Numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "artificial-baker",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities['some_computed_value'] = cities.geonameid ** 3  # create new column (generates integer overflow!)\n",
    "cities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "motivated-property",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities = cities.drop(columns=['some_computed_value'])  # remove columns again (returns new dataframe !)\n",
    "cities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "personal-variation",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities.country == 'Denmark'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "disabled-passport",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(list(cities.country == 'Denmark'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "honest-stationery",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "Counter(cities.country == 'Denmark')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "indirect-makeup",
   "metadata": {},
   "outputs": [],
   "source": [
    "type(cities[['name', 'country']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "operating-stomach",
   "metadata": {},
   "outputs": [],
   "source": [
    "type(cities.country == 'Denmark')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "frank-endorsement",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities[cities.country == 'Denmark']  # similar to Numpy filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "floral-minority",
   "metadata": {},
   "outputs": [],
   "source": [
    "DK = cities[cities.country == 'Denmark']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "still-crown",
   "metadata": {},
   "outputs": [],
   "source": [
    "DK.sort_values('name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "martial-leeds",
   "metadata": {},
   "outputs": [],
   "source": [
    "DK.set_index('geonameid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "champion-latter",
   "metadata": {},
   "outputs": [],
   "source": [
    "DK[DK.name == 'Århus']  # note row number is the original row number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "equipped-international",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "DK.loc[5300:5400]  # note row index is original index into cities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "textile-course",
   "metadata": {},
   "outputs": [],
   "source": [
    "DK.iloc[:10]  # integer location, 10 first rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "pointed-violence",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities.set_index('country')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "synthetic-lawrence",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities.set_index(['country', 'subcountry'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "secondary-albania",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities.set_index(['country', 'subcountry']).loc['Denmark'].sort_values('name')  # index multiple rows & sort"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "greatest-butler",
   "metadata": {},
   "outputs": [],
   "source": [
    "# index multiple rows & sort\n",
    "cities.set_index(['country', 'subcountry']).loc['Denmark'].sort_index(ascending=False)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "smart-cheese",
   "metadata": {},
   "outputs": [],
   "source": [
    "geo = cities.set_index('geonameid')\n",
    "geo.loc[2624652, ['name', 'country']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dated-tragedy",
   "metadata": {},
   "outputs": [],
   "source": [
    "geo = cities.set_index('geonameid')\n",
    "geo.loc[2620320:2624652, ['name', 'country']]  # note that slices on .loc are INCLUSIVE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "distinguished-drilling",
   "metadata": {},
   "outputs": [],
   "source": [
    "geo.at[2624652, 'name']"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}