{ "cells": [ { "cell_type": "markdown", "id": "prescribed-green", "metadata": {}, "source": [ "# Pandas (5/5-2021)" ] }, { "cell_type": "markdown", "id": "weekly-playlist", "metadata": {}, "source": [ "## Exercise\n", "\n", "Load csv file with major cities from https://datahub.io/core/world-cities into a Pandas dataframe." ] }, { "cell_type": "code", "execution_count": null, "id": "flexible-walker", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "cities = pd.read_csv('world-cities_csv.csv')\n", "print(cities) # ascii printout" ] }, { "cell_type": "code", "execution_count": null, "id": "fantastic-reflection", "metadata": {}, "outputs": [], "source": [ "cities # nicely integrades with Jupyter" ] }, { "cell_type": "code", "execution_count": null, "id": "alike-ability", "metadata": {}, "outputs": [], "source": [ "cities['country'] # single column" ] }, { "cell_type": "code", "execution_count": null, "id": "tracked-speaker", "metadata": {}, "outputs": [], "source": [ "cities.country # single column, shorter" ] }, { "cell_type": "code", "execution_count": null, "id": "literary-boost", "metadata": {}, "outputs": [], "source": [ "cities[['name', 'country']] # multiple columns, note index is a list of columns" ] }, { "cell_type": "code", "execution_count": null, "id": "second-lotus", "metadata": {}, "outputs": [], "source": [ "cities[5400:5410] # can slice by row number" ] }, { "cell_type": "code", "execution_count": null, "id": "adapted-municipality", "metadata": {}, "outputs": [], "source": [ "cities[5409] # fails :-(, use .loc instead" ] }, { "cell_type": "code", "execution_count": null, "id": "educated-label", "metadata": {}, "outputs": [], "source": [ "cities.loc[10] # use loc to index by row index" ] }, { "cell_type": "code", "execution_count": null, "id": "gothic-graduation", "metadata": {}, "outputs": [], "source": [ "cities.loc[:10, ['name', 'country']] # slicing and extracting columns" ] }, { "cell_type": "code", "execution_count": null, "id": "gothic-induction", "metadata": {}, "outputs": [], "source": [ "cities.at[5409, 'name'] # use .at to look entry in table by row/column index" ] }, { "cell_type": "code", "execution_count": null, "id": "logical-denver", "metadata": {}, "outputs": [], "source": [ "cities.geonameid * 10 # can do operations on columns, like Numpy" ] }, { "cell_type": "code", "execution_count": null, "id": "artificial-baker", "metadata": {}, "outputs": [], "source": [ "cities['some_computed_value'] = cities.geonameid ** 3 # create new column (generates integer overflow!)\n", "cities" ] }, { "cell_type": "code", "execution_count": null, "id": "motivated-property", "metadata": {}, "outputs": [], "source": [ "cities = cities.drop(columns=['some_computed_value']) # remove columns again (returns new dataframe !)\n", "cities" ] }, { "cell_type": "code", "execution_count": null, "id": "personal-variation", "metadata": {}, "outputs": [], "source": [ "cities.country == 'Denmark'" ] }, { "cell_type": "code", "execution_count": null, "id": "disabled-passport", "metadata": {}, "outputs": [], "source": [ "print(list(cities.country == 'Denmark'))" ] }, { "cell_type": "code", "execution_count": null, "id": "honest-stationery", "metadata": {}, "outputs": [], "source": [ "from collections import Counter\n", "Counter(cities.country == 'Denmark')" ] }, { "cell_type": "code", "execution_count": null, "id": "indirect-makeup", "metadata": {}, "outputs": [], "source": [ "type(cities[['name', 'country']])" ] }, { "cell_type": "code", "execution_count": null, "id": "operating-stomach", "metadata": {}, "outputs": [], "source": [ "type(cities.country == 'Denmark')" ] }, { "cell_type": "code", "execution_count": null, "id": "frank-endorsement", "metadata": {}, "outputs": [], "source": [ "cities[cities.country == 'Denmark'] # similar to Numpy filtering" ] }, { "cell_type": "code", "execution_count": null, "id": "floral-minority", "metadata": {}, "outputs": [], "source": [ "DK = cities[cities.country == 'Denmark']" ] }, { "cell_type": "code", "execution_count": null, "id": "still-crown", "metadata": {}, "outputs": [], "source": [ "DK.sort_values('name')" ] }, { "cell_type": "code", "execution_count": null, "id": "martial-leeds", "metadata": {}, "outputs": [], "source": [ "DK.set_index('geonameid')" ] }, { "cell_type": "code", "execution_count": null, "id": "champion-latter", "metadata": {}, "outputs": [], "source": [ "DK[DK.name == 'Ã…rhus'] # note row number is the original row number" ] }, { "cell_type": "code", "execution_count": null, "id": "equipped-international", "metadata": { "scrolled": true }, "outputs": [], "source": [ "DK.loc[5300:5400] # note row index is original index into cities" ] }, { "cell_type": "code", "execution_count": null, "id": "textile-course", "metadata": {}, "outputs": [], "source": [ "DK.iloc[:10] # integer location, 10 first rows" ] }, { "cell_type": "code", "execution_count": null, "id": "pointed-violence", "metadata": {}, "outputs": [], "source": [ "cities.set_index('country')" ] }, { "cell_type": "code", "execution_count": null, "id": "synthetic-lawrence", "metadata": {}, "outputs": [], "source": [ "cities.set_index(['country', 'subcountry'])" ] }, { "cell_type": "code", "execution_count": null, "id": "secondary-albania", "metadata": {}, "outputs": [], "source": [ "cities.set_index(['country', 'subcountry']).loc['Denmark'].sort_values('name') # index multiple rows & sort" ] }, { "cell_type": "code", "execution_count": null, "id": "greatest-butler", "metadata": {}, "outputs": [], "source": [ "# index multiple rows & sort\n", "cities.set_index(['country', 'subcountry']).loc['Denmark'].sort_index(ascending=False) " ] }, { "cell_type": "code", "execution_count": null, "id": "smart-cheese", "metadata": {}, "outputs": [], "source": [ "geo = cities.set_index('geonameid')\n", "geo.loc[2624652, ['name', 'country']]" ] }, { "cell_type": "code", "execution_count": null, "id": "dated-tragedy", "metadata": {}, "outputs": [], "source": [ "geo = cities.set_index('geonameid')\n", "geo.loc[2620320:2624652, ['name', 'country']] # note that slices on .loc are INCLUSIVE" ] }, { "cell_type": "code", "execution_count": null, "id": "distinguished-drilling", "metadata": {}, "outputs": [], "source": [ "geo.at[2624652, 'name']" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" } }, "nbformat": 4, "nbformat_minor": 5 }