9. A complete example: extracting data from the whole world¶
This notebook covers all the steps involved in a typical data collection using Facebook Marketing API data:
Before the collection starts: We begin by acquiring the FB IDs and shapefiles for the locations we are interested in.
Data Collection: This step is where the primary data collection happens and might take weeks to finish.
Post-processing the Collection: Once the data collection has finished, we post-process the data to create a clean CSV file for data analysis.
Maps: We finally plot a map with the collection outcome.
9.1. Before the collection starts¶
We load here all necessary libraries used in this script. Feel free to comment out some of them if you are interested in only a part of this script.
import os
import io
import base64
import shutil
import pandas as pd
from pysocialwatcher.utils import double_country_conversion
from pysocialwatcher import watcherAPI
from pysocialwatcher.json_builder import JSONBuilder, AgeList, Age, Genders, get_predefined_behavior, LocationList
from pysocialwatcher import post_process
import json
# For the map
import matplotlib.pyplot as plt
import folium
from folium import plugins
from folium.features import GeoJson, GeoJsonTooltip, GeoJsonPopup
from branca.colormap import linear
import numpy as np
import branca
import uuid
import geopandas as gpd
from shapely import wkt
We start by using pySocialWatcher to download all shapefiles and country codes from a given list of locations.
# Create the pySocialWatcher client, then read the API credentials from a
# local CSV file.
credentials_file = "credentials.csv"
watcher = watcherAPI(sleep_time=5, api_version="9.0")
watcher.load_credentials_file(credentials_file)
# Names of the countries/territories to collect. Each name is converted to a
# two-letter code with double_country_conversion() before its shape is downloaded.
# NOTE(review): in the recorded output 'Guinea-Bissau' resolves to GN and
# 'Guinea' to GW, which is the reverse of the ISO 3166-1 assignment -- confirm
# the mapping used by double_country_conversion().
country_list = ['Andorra', 'United Arab Emirates', 'Afghanistan', 'Antigua and Barbuda', 'Albania', 'Armenia', 'Angola', 'Argentina', 'American Samoa',
'Austria', 'Australia', 'Aruba', 'Azerbaijan', 'Bosnia and Herzegovina', 'Barbados', 'Bangladesh', 'Belgium', 'Burkina Faso', 'Bulgaria',
'Bahrain', 'Burundi', 'Benin', 'Bermuda', 'Brunei', 'Bolivia', 'Brazil', 'Bahamas', 'Bhutan', 'Botswana', 'Belarus', 'Belize', 'Canada',
'Congo Dem. Rep.', 'Central African Republic', 'Congo Rep.', 'Switzerland', "Cote d'Ivoire", 'Cook Islands', 'Chile', 'Cameroon', 'China',
'Colombia', 'Costa Rica', 'Cape Verde', 'Curacao', 'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Denmark', 'Dominica', 'Dominican Republic',
'Algeria', 'Ecuador', 'Estonia', 'Egypt', 'Western Sahara', 'Eritrea', 'Spain', 'Ethiopia', 'Finland', 'Fiji', 'Falkland Islands', 'Micronesia',
'Faroe Islands', 'France', 'Gabon', 'United Kingdom', 'Grenada', 'Georgia', 'French Guiana', 'Guernsey', 'Ghana', 'Gibraltar', 'Greenland', 'Gambia',
'Guinea-Bissau', 'Guadeloupe', 'Equatorial Guinea', 'Greece', 'Guatemala', 'Guam', 'Guinea', 'Guyana', 'Hong Kong', 'Honduras', 'Croatia', 'Haiti',
'Hungary', 'Indonesia', 'Ireland', 'Israel', 'Isle of Man', 'India', 'Iraq', 'Iceland', 'Italy', 'Jersey', 'Jamaica', 'Jordan', 'Japan', 'Kenya',
'Kyrgyzstan', 'Cambodia', 'Kiribati', 'Comoros', 'Saint Kitts and Nevis', 'South Korea', 'Kuwait', 'Cayman Islands', 'Kazakhstan', 'Laos', 'Lebanon',
'Saint Lucia', 'Liechtenstein', 'Sri Lanka', 'Liberia', 'Lesotho', 'Lithuania', 'Luxembourg', 'Latvia', 'Libya', 'Morocco', 'Monaco', 'Moldova',
'Montenegro', 'Saint Martin', 'Madagascar', 'Marshall Islands', 'Macedonia', 'Mali', 'Myanmar', 'Mongolia', 'Macau', 'Northern Mariana Islands',
'Martinique', 'Mauritania', 'Montserrat', 'Malta', 'Mauritius', 'Maldives', 'Malawi', 'Mexico', 'Malaysia', 'Mozambique', 'Namibia', 'New Caledonia',
'Niger', 'Norfolk Island', 'Nigeria', 'Nicaragua', 'Netherlands', 'Norway', 'Nepal', 'Nauru', 'Niue', 'New Zealand', 'Oman', 'Panama', 'Peru',
'French Polynesia', 'Papua New Guinea', 'Philippines', 'Pakistan', 'Poland', 'Saint Pierre and Miquelon', 'Pitcairn', 'Puerto Rico', 'Palestine',
'Portugal', 'Palau', 'Paraguay', 'Qatar', 'Reunion', 'Romania', 'Serbia', 'Russia', 'Rwanda', 'Saudi Arabia', 'Solomon Islands', 'Seychelles', 'Sweden',
'Singapore', 'Saint Helena', 'Slovenia', 'Svalbard and Jan Mayen', 'Slovakia', 'Sierra Leone', 'San Marino', 'Senegal', 'Somalia', 'Suriname', 'South Sudan',
'Sao Tome and Principe', 'El Salvador', 'Sint Maarten', 'Swaziland', 'Turks and Caicos Islands', 'Chad', 'Togo', 'Thailand', 'Tajikistan', 'Tokelau',
'Timor-Leste', 'Turkmenistan', 'Tunisia', 'Tonga', 'Turkey', 'Trinidad and Tobago', 'Tuvalu', 'Taiwan', 'Tanzania', 'Ukraine', 'Uganda', 'United States',
'Uruguay', 'Uzbekistan', 'Saint Vincent and the Grenadines', 'Venezuela', 'British Virgin Islands', 'US Virgin Islands', 'Vietnam', 'Vanuatu',
'Wallis and Futuna', 'Samoa', 'Kosovo', 'Yemen', 'Mayotte', 'South Africa', 'Zambia', 'Zimbabwe']
def _download_country_kml(country_name):
    """Convert *country_name* to its two-letter code and fetch its KML dataframe."""
    # Get the 2-letters code for a given country: United States -> US
    code = double_country_conversion(country_name)
    print("Downloading %s (%s)" % (country_name, code))
    return watcherAPI.get_KML_given_geolocation("countries", [code])


# One dataframe per country, fetched in list order, then stacked into `df`.
df_tmp = []
for country in country_list:
    df_tmp.append(_download_country_kml(country))
df = pd.concat(df_tmp)
df.head(3)
Downloading Andorra (AD)
Downloading United Arab Emirates (AE)
Downloading Afghanistan (AF)
Downloading Antigua and Barbuda (AG)
Downloading Albania (AL)
Downloading Armenia (AM)
Downloading Angola (AO)
Downloading Argentina (AR)
Downloading American Samoa (AS)
Downloading Austria (AT)
Downloading Australia (AU)
Downloading Aruba (AW)
Downloading Azerbaijan (AZ)
Downloading Bosnia and Herzegovina (BA)
Downloading Barbados (BB)
Downloading Bangladesh (BD)
Downloading Belgium (BE)
Downloading Burkina Faso (BF)
Downloading Bulgaria (BG)
Downloading Bahrain (BH)
Downloading Burundi (BI)
Downloading Benin (BJ)
Downloading Bermuda (BM)
Downloading Brunei (BN)
Downloading Bolivia (BO)
Downloading Brazil (BR)
Downloading Bahamas (BS)
Downloading Bhutan (BT)
Downloading Botswana (BW)
Downloading Belarus (BY)
Downloading Belize (BZ)
Downloading Canada (CA)
Downloading Congo Dem. Rep. (CD)
Downloading Central African Republic (CF)
Downloading Congo Rep. (CG)
Downloading Switzerland (CH)
Downloading Cote d'Ivoire (CI)
Downloading Cook Islands (CK)
Downloading Chile (CL)
Downloading Cameroon (CM)
Downloading China (CN)
Downloading Colombia (CO)
Downloading Costa Rica (CR)
Downloading Cape Verde (CV)
Downloading Curacao (CW)
Downloading Cyprus (CY)
Downloading Czech Republic (CZ)
Downloading Germany (DE)
Downloading Djibouti (DJ)
Downloading Denmark (DK)
Downloading Dominica (DM)
Downloading Dominican Republic (DO)
Downloading Algeria (DZ)
Downloading Ecuador (EC)
Downloading Estonia (EE)
Downloading Egypt (EG)
Downloading Western Sahara (EH)
Downloading Eritrea (ER)
Downloading Spain (ES)
Downloading Ethiopia (ET)
Downloading Finland (FI)
Downloading Fiji (FJ)
Downloading Falkland Islands (FK)
Downloading Micronesia (FM)
Downloading Faroe Islands (FO)
Downloading France (FR)
Downloading Gabon (GA)
Downloading United Kingdom (GB)
Downloading Grenada (GD)
Downloading Georgia (GE)
Downloading French Guiana (GF)
Downloading Guernsey (GG)
Downloading Ghana (GH)
Downloading Gibraltar (GI)
Downloading Greenland (GL)
Downloading Gambia (GM)
Downloading Guinea-Bissau (GN)
Downloading Guadeloupe (GP)
Downloading Equatorial Guinea (GQ)
Downloading Greece (GR)
Downloading Guatemala (GT)
Downloading Guam (GU)
Downloading Guinea (GW)
Downloading Guyana (GY)
Downloading Hong Kong (HK)
Downloading Honduras (HN)
Downloading Croatia (HR)
Downloading Haiti (HT)
Downloading Hungary (HU)
Downloading Indonesia (ID)
Downloading Ireland (IE)
Downloading Israel (IL)
Downloading Isle of Man (IM)
Downloading India (IN)
Downloading Iraq (IQ)
Downloading Iceland (IS)
Downloading Italy (IT)
Downloading Jersey (JE)
Downloading Jamaica (JM)
Downloading Jordan (JO)
Downloading Japan (JP)
Downloading Kenya (KE)
Downloading Kyrgyzstan (KG)
Downloading Cambodia (KH)
Downloading Kiribati (KI)
Downloading Comoros (KM)
Downloading Saint Kitts and Nevis (KN)
Downloading South Korea (KR)
Downloading Kuwait (KW)
Downloading Cayman Islands (KY)
Downloading Kazakhstan (KZ)
Downloading Laos (LA)
Downloading Lebanon (LB)
Downloading Saint Lucia (LC)
Downloading Liechtenstein (LI)
Downloading Sri Lanka (LK)
Downloading Liberia (LR)
Downloading Lesotho (LS)
Downloading Lithuania (LT)
Downloading Luxembourg (LU)
Downloading Latvia (LV)
Downloading Libya (LY)
Downloading Morocco (MA)
Downloading Monaco (MC)
Downloading Moldova (MD)
Downloading Montenegro (ME)
Downloading Saint Martin (MF)
Downloading Madagascar (MG)
Downloading Marshall Islands (MH)
Downloading Macedonia (MK)
Downloading Mali (ML)
Downloading Myanmar (MM)
Downloading Mongolia (MN)
Downloading Macau (MO)
Downloading Northern Mariana Islands (MP)
Downloading Martinique (MQ)
Downloading Mauritania (MR)
Downloading Montserrat (MS)
Downloading Malta (MT)
Downloading Mauritius (MU)
Downloading Maldives (MV)
Downloading Malawi (MW)
Downloading Mexico (MX)
Downloading Malaysia (MY)
Downloading Mozambique (MZ)
Downloading Namibia (NA)
Downloading New Caledonia (NC)
Downloading Niger (NE)
Downloading Norfolk Island (NF)
Downloading Nigeria (NG)
Downloading Nicaragua (NI)
Downloading Netherlands (NL)
Downloading Norway (NO)
Downloading Nepal (NP)
Downloading Nauru (NR)
Downloading Niue (NU)
Downloading New Zealand (NZ)
Downloading Oman (OM)
Downloading Panama (PA)
Downloading Peru (PE)
Downloading French Polynesia (PF)
Downloading Papua New Guinea (PG)
Downloading Philippines (PH)
Downloading Pakistan (PK)
Downloading Poland (PL)
Downloading Saint Pierre and Miquelon (PM)
Downloading Pitcairn (PN)
Downloading Puerto Rico (PR)
Downloading Palestine (PS)
Downloading Portugal (PT)
Downloading Palau (PW)
Downloading Paraguay (PY)
Downloading Qatar (QA)
Downloading Reunion (RE)
Downloading Romania (RO)
Downloading Serbia (RS)
Downloading Russia (RU)
Downloading Rwanda (RW)
Downloading Saudi Arabia (SA)
Downloading Solomon Islands (SB)
Downloading Seychelles (SC)
Downloading Sweden (SE)
Downloading Singapore (SG)
Downloading Saint Helena (SH)
Downloading Slovenia (SI)
Downloading Svalbard and Jan Mayen (SJ)
Downloading Slovakia (SK)
Downloading Sierra Leone (SL)
Downloading San Marino (SM)
Downloading Senegal (SN)
Downloading Somalia (SO)
Downloading Suriname (SR)
Downloading South Sudan (SS)
Downloading Sao Tome and Principe (ST)
Downloading El Salvador (SV)
Downloading Sint Maarten (SX)
Downloading Swaziland (SZ)
Downloading Turks and Caicos Islands (TC)
Downloading Chad (TD)
Downloading Togo (TG)
Downloading Thailand (TH)
Downloading Tajikistan (TJ)
Downloading Tokelau (TK)
Downloading Timor-Leste (TL)
Downloading Turkmenistan (TM)
Downloading Tunisia (TN)
Downloading Tonga (TO)
Downloading Turkey (TR)
Downloading Trinidad and Tobago (TT)
Downloading Tuvalu (TV)
Downloading Taiwan (TW)
Downloading Tanzania (TZ)
Downloading Ukraine (UA)
Downloading Uganda (UG)
Downloading United States (US)
Downloading Uruguay (UY)
Downloading Uzbekistan (UZ)
Downloading Saint Vincent and the Grenadines (VC)
Downloading Venezuela (VE)
Downloading British Virgin Islands (VG)
Downloading US Virgin Islands (VI)
Downloading Vietnam (VN)
Downloading Vanuatu (VU)
Downloading Wallis and Futuna (WF)
Downloading Samoa (WS)
Downloading Kosovo (XK)
Downloading Yemen (YE)
Downloading Mayotte (YT)
Downloading South Africa (ZA)
Downloading Zambia (ZM)
Downloading Zimbabwe (ZW)
name | kml | key | |
---|---|---|---|
0 | Andorra | <Polygon><outerBoundaryIs><LinearRing><coordin... | AD |
0 | United Arab Emirates | <Polygon><outerBoundaryIs><LinearRing><coordin... | AE |
0 | Afghanistan | <Polygon><outerBoundaryIs><LinearRing><coordin... | AF |
We take advantage of JSONBuilder here to generate a JSON file for the data collection. Details can be found in our previous notebook.
# Output files produced by this cell.
json_outputname = "countries_whole_world.json"
geojson_outname = "countries_whole_world.geojson"

# Locations: built from the dataframe of downloaded country shapes.
loclist = LocationList()
loclist.get_location_list_from_df(df)

# Age groups, given as (min, max) pairs; None is passed where the original
# leaves the upper bound unspecified.
ageList = AgeList()
for age_min, age_max in [(18, None), (18, 24), (25, 34), (35, 54), (55, None)]:
    ageList.add(Age(age_min, age_max))

# Gender: male, female and the combined audience.
genders = Genders(combined=True, female=True, male=True)

# Pre-defined "connectivity" behavior (collects #users using Wifi, 2G, 3G, 4G).
connectivity_behavior = get_predefined_behavior(option="connectivity")

# Assemble the collection description and write it to the JSON file.
jsonb = JSONBuilder(
    name="LargeWorldCollection",
    location_list=loclist,
    age_list=ageList,
    genders=genders,
    behavior_groups=connectivity_behavior,
)
jsonb.jsonfy(json_outputname)

# Expose the location key under the name "country_code" as well, then export
# the shapes as GeoJSON so Folium can use them later on.
df["country_code"] = df["key"]
watcherAPI.transform_KML_into_geojson(df, geojson_outname)
print("Created file %s" % geojson_outname)
Created file countries_whole_world.json.
Created file countries_whole_world.geojson
9.1.1. Simplifying the shapefiles/KMLs/geojsons:¶
Most of the time, simplifying the shapefiles makes the HTML created by Folium load faster without a noticeable loss of precision. This matters because the KML/GeoJSON from Facebook might be too big to plot using Folium.
A straightforward way to still use it successfully is by simplifying the shapefile/KML/Geojson using the command line ogr2ogr from the GDAL package.
See https://gdal.org/download.html with instructions on how to download it for your OS.
If you have installed it on your Linux system, the following command can help to decrease the size of the shapefile:
import subprocess

# ogr2ogr ships with GDAL; skip the simplification when it is not installed.
if shutil.which("ogr2ogr") is not None:
    oldname = geojson_outname
    # OLDNAME.extension -> OLDNAME_simplified.extension.
    # splitext() only splits at the final dot, so a path such as
    # "./my.data.geojson" keeps its directory and full stem.
    stem, extension = os.path.splitext(oldname)
    newname = stem + "_simplified" + extension
    # Pass an argument list rather than a shell string, so file names with
    # spaces or shell metacharacters reach ogr2ogr untouched.
    # NOTE(review): 0.01 is the -simplify tolerance, presumably in the units of
    # the layer's coordinates -- confirm against the GDAL documentation.
    command = ["ogr2ogr", "-f", "geojson", "-simplify", "0.01", newname, oldname]
    print("\033[1mRunning command:\033[0m\n%s" % " ".join(command))
    # check=True raises CalledProcessError if ogr2ogr fails, instead of going
    # on to report the size of a missing (or stale) output file.
    subprocess.run(command, check=True)
    print('* Size of old file: {:,.1f}'.format(os.path.getsize(oldname) / float(1 << 20)) + " MB")
    print('* Size of simplified file: {:,.1f}'.format(os.path.getsize(newname) / float(1 << 20)) + " MB")
Running command:
ogr2ogr -f geojson -simplify 0.01 countries_whole_world_simplified.geojson countries_whole_world.geojson
* Size of old file: 61.6 MB
* Size of simplified file: 9.7 MB
9.2. Collecting data¶
This is a very large data collection with 17,325 API calls. There are several ways to speed it up; the simplest is to use multiple users/tokens at the same time: for this, you just need to have multiple rows in the `credentials.csv` file.
# Run the whole collection described by the JSON file created earlier.
# It might take several days.
# NOTE(review): this new instance never calls load_credentials_file();
# presumably the credentials loaded earlier are still in effect -- confirm.
collection_spec = "countries_whole_world.json"
collection_output = "world_collection_psw.csv.gz"
watcher = watcherAPI(outputname=collection_output, sleep_time=7, api_version="9.0")
df = watcher.run_data_collection(collection_spec, remove_tmp_files=True)
# Omitted for brevity
9.3. Post-processing Steps¶
Once the data collection is done, we can simply post-process it with the following commands:
# Load the gzip-compressed CSV written by the collection step and preview it.
collected_csv = "./world_collection_psw.csv.gz"
df = pd.read_csv(collected_csv)
df.head(5)
Unnamed: 0 | name | interests | ages_ranges | genders | behavior | scholarities | languages | family_statuses | relationship_statuses | ... | household_composition | all_fields | targeting | response | dau_audience | mau_audience | access_device | timestamp | publisher_platforms | mock_response | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | LargeWorldCollection | NaN | {'min': 18} | 0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | (('ages_ranges', {'min': 18}), ('genders', 0),... | {'geo_locations': {'countries': ['AD'], 'locat... | b'{"data":[{"daily_outcomes_curve":[{"spend":0... | 0 | 1000 | {'name': '2G', 'or': [6017253486583]} | 1612206421 | ["facebook"] | False |
1 | 1 | LargeWorldCollection | NaN | {'min': 18} | 0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | (('ages_ranges', {'min': 18}), ('genders', 0),... | {'geo_locations': {'countries': ['AE'], 'locat... | b'{"data":[{"daily_outcomes_curve":[{"spend":0... | 1124 | 2500 | {'name': '2G', 'or': [6017253486583]} | 1612206421 | ["facebook"] | False |
2 | 2 | LargeWorldCollection | NaN | {'min': 18} | 0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | (('ages_ranges', {'min': 18}), ('genders', 0),... | {'geo_locations': {'countries': ['AF'], 'locat... | b'{"data":[{"daily_outcomes_curve":[{"spend":0... | 36418 | 120000 | {'name': '2G', 'or': [6017253486583]} | 1612206421 | ["facebook"] | False |
3 | 3 | LargeWorldCollection | NaN | {'min': 18} | 0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | (('ages_ranges', {'min': 18}), ('genders', 0),... | {'geo_locations': {'countries': ['AG'], 'locat... | b'{"data":[{"daily_outcomes_curve":[{"spend":0... | 994 | 1000 | {'name': '2G', 'or': [6017253486583]} | 1612206421 | ["facebook"] | False |
4 | 4 | LargeWorldCollection | NaN | {'min': 18} | 0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | (('ages_ranges', {'min': 18}), ('genders', 0),... | {'geo_locations': {'countries': ['AL'], 'locat... | b'{"data":[{"daily_outcomes_curve":[{"spend":0... | 399 | 1000 | {'name': '2G', 'or': [6017253486583]} | 1612206421 | ["facebook"] | False |
5 rows × 21 columns
# Post-process the raw collection, then preview the descriptive columns of the
# first four rows.
processed_df = post_process.post_process_df_collection(df)
preview_columns = ["LocationType", "FullLocation", "Gender", "Ages", "Education"]
processed_df.head(4)[preview_columns]
LocationType | FullLocation | Gender | Ages | Education | |
---|---|---|---|---|---|
0 | country | AD | both | 18- | AllDegrees |
1 | country | AE | both | 18- | AllDegrees |
2 | country | AF | both | 18- | AllDegrees |
3 | country | AG | both | 18- | AllDegrees |
# Combine gender, ages and device into the single "combo" column that the
# pivot below spreads into columns.
cols_to_combine = ["Gender", "Ages", "Device"]
long_df = post_process.combine_cols(processed_df, cols_to_combine)

# Reshape to one row per "Key" with one mau_audience column per combo value.
wide_df = long_df.pivot(index="Key", columns="combo", values="mau_audience")
combo_df = wide_df.reset_index()
combo_df.head()
combo | Key | both_18-24_2G | both_18-24_3G | both_18-24_4G | both_18-24_AllDevices | both_18-24_Wifi | both_18-_2G | both_18-_3G | both_18-_4G | both_18-_AllDevices | ... | male_35-54_2G | male_35-54_3G | male_35-54_4G | male_35-54_AllDevices | male_35-54_Wifi | male_55-_2G | male_55-_3G | male_55-_4G | male_55-_AllDevices | male_55-_Wifi |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AD | 1000 | 1000 | 1000 | 2900 | 1500 | 1000 | 1000 | 1000 | 39000 | ... | 1000 | 1000 | 1000 | 8900 | 5300 | 1000 | 1000 | 1000 | 3900 | 2200 |
1 | AE | 1000 | 13000 | 140000 | 1100000 | 700000 | 2500 | 79000 | 960000 | 7800000 | ... | 1000 | 18000 | 270000 | 1900000 | 1200000 | 1000 | 2100 | 24000 | 200000 | 130000 |
2 | AF | 47000 | 910000 | 120000 | 1500000 | 69000 | 120000 | 2300000 | 320000 | 3900000 | ... | 6300 | 180000 | 34000 | 330000 | 35000 | 8800 | 80000 | 8800 | 140000 | 9600 |
3 | AG | 1000 | 1000 | 1800 | 9300 | 4300 | 1000 | 4000 | 12000 | 55000 | ... | 1000 | 1000 | 2200 | 9300 | 4100 | 1000 | 1000 | 1000 | 3400 | 1700 |
4 | AL | 1000 | 4000 | 22000 | 240000 | 120000 | 1000 | 24000 | 120000 | 1200000 | ... | 1000 | 5000 | 24000 | 240000 | 140000 | 1000 | 3400 | 12000 | 110000 | 71000 |
5 rows × 76 columns
9.4. Maps¶
Last, we use Folium to create a visualization.
The first thing that we do is loading the simplified geojson file that we created at the beginning of this notebook.
# Read the simplified GeoJSON. The context manager closes the file handle that
# a bare open() inside json.load() would leave dangling. The encoding is set
# explicitly because GeoJSON is UTF-8 and the platform default may differ.
with open("./countries_whole_world_simplified.geojson", encoding="utf-8") as geojson_file:
    data = json.load(geojson_file)

kml_df = gpd.GeoDataFrame.from_features(data, crs='EPSG:4326')
# Align the key column name with combo_df so the merge joins on "Key".
kml_df = kml_df.rename(columns={"key": "Key"})

# :::important::: you need to merge like below. If you try to use pd.merge(df1, df2), it wont work!
# NOTE(review): presumably because the GeoDataFrame method keeps the result a
# GeoDataFrame -- confirm against the geopandas documentation.
map_df = kml_df.merge(combo_df)
map_df.head(2)
geometry | name | Key | country | both_18-24_2G | both_18-24_3G | both_18-24_4G | both_18-24_AllDevices | both_18-24_Wifi | both_18-_2G | ... | male_35-54_2G | male_35-54_3G | male_35-54_4G | male_35-54_AllDevices | male_35-54_Wifi | male_55-_2G | male_55-_3G | male_55-_4G | male_55-_AllDevices | male_55-_Wifi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | POLYGON ((1.40970 42.48709, 1.44137 42.47546, ... | Andorra | AD | Andorra | 1000 | 1000 | 1000 | 2900 | 1500 | 1000 | ... | 1000 | 1000 | 1000 | 8900 | 5300 | 1000 | 1000 | 1000 | 3900 | 2200 |
1 | POLYGON ((56.37298 24.98056, 56.36980 24.97986... | United Arab Emirates | AE | United Arab Emirates | 1000 | 13000 | 140000 | 1100000 | 700000 | 2500 | ... | 1000 | 18000 | 270000 | 1900000 | 1200000 | 1000 | 2100 | 24000 | 200000 | 130000 |
2 rows × 79 columns
The next cells will load the files needed to process the HTML pop-up on the map.
Note that, when `save_locally=True`, the plots are saved into a folder called `figures`. This helps to create a much lighter HTML source, which will be much faster to load later on. The drawback is that we will need to move the `figures` folder together with the HTML file created by Folium.
def getHTMLCombo(row, save_locally=False, folder="figures/"):
    """Build the HTML fragment shown in the map pop-up of one location.

    Parameters:
        row: mapping-like record (e.g. one dataframe row) providing "name" and
            the audience columns named "<gender>_<ages>_<device>" read below,
            such as "both_18-_AllDevices" or "male_25-34_Wifi".
        save_locally: forwarded to getPie/getPyramid. When False the charts are
            embedded as base64 data URIs; when True they are written to disk
            and referenced by path.
        folder: forwarded to getPie/getPyramid as the output folder.

    Returns:
        str: the HTML with the location name, the total number of users and
        the gender, age and connectivity charts.
    """
    from html import escape

    print("Processing..." + row["name"])

    connectivity_labels = ["Wifi", "2G", "3G", "4G"]
    age_labels = ["18-24", "25-34", "35-54", "55+"]
    # Age fragments as they appear in the column names ("55-" is the open 55+ bracket).
    age_keys = ["18-24", "25-34", "35-54", "55-"]

    def connectivity_pie(prefix, title):
        # One slice per connectivity type for the audience selected by `prefix`.
        sizes = [row["%s_%s" % (prefix, device)] for device in connectivity_labels]
        return getPie(connectivity_labels, sizes, title=title,
                      save_locally=save_locally, folder=folder)

    total_pop_fb = float(row['both_18-_AllDevices'])

    pie_fb_gender = getPie(["Male", "Female"],
                           [row["male_18-_AllDevices"], row["female_18-_AllDevices"]],
                           title="Genders (FB)", save_locally=save_locally, folder=folder)

    pie_connectivity = connectivity_pie("both_18-", "Connectivity (All)")
    pie_connectivity_male = connectivity_pie("male_18-", "Connectivity (Male)")
    pie_connectivity_female = connectivity_pie("female_18-", "Connectivity (Female)")
    pie_connectivity_18_24 = connectivity_pie("both_18-24", "Connectivity (18-24)")
    pie_connectivity_25_34 = connectivity_pie("both_25-34", "Connectivity (25-34)")
    pie_connectivity_35_54 = connectivity_pie("both_35-54", "Connectivity (35-54)")
    pie_connectivity_55_ = connectivity_pie("both_55-", "Connectivity (55+)")

    # Each slice uses its own age bracket. The "18-24" slice previously read
    # "both_18-_AllDevices", i.e. the whole 18+ audience.
    pie_fb_age = getPie(age_labels,
                        [row["both_%s_AllDevices" % key] for key in age_keys],
                        title="Ages (FB)", save_locally=save_locally, folder=folder)

    female_by_age = [row["female_%s_AllDevices" % key] for key in age_keys]
    male_by_age = [row["male_%s_AllDevices" % key] for key in age_keys]
    pyramid_age = getPyramid(age_labels, ["female", "male"], female_by_age, male_by_age,
                             save_locally=save_locally, folder=folder)
    pyramid_age_perc = getPyramid(age_labels, ["female", "male"], female_by_age, male_by_age,
                                  normalized=True, save_locally=save_locally, folder=folder)

    # The name is inserted as escaped text: encoding it to bytes would render
    # as b'...' on Python 3. The user count is printed without decimals.
    popup_html = """
<h3> <b> Location: </b> <i> {name} </i> </h3>
<h5> <b>Total #Users (FB): </b> {total_pop_fb:,.0f} </br> </h5>
<h5> <b> Gender Distribution </b> </h5>
<center>
<img src='{pie_fb_gender}'/>
</center>
<h5> <b> Age Distribution </b> </h5>
<center>
<img src='{pie_fb_age}'/>
</center>
<h5> <b> Age Distribution per Gender</b> </h5>
<center>
<img src='{pyramid_age}'/>
<img src='{pyramid_age_perc}'/>
</center>
<h5> <b> Connectivity according to FB </b> </h5>
<center>
<img src='{pie_connectivity}'/>
<img src='{pie_connectivity_male}'/>
<img src='{pie_connectivity_female}'/>
</center>
<h5> <b> Connectivity by Age </b> </h5>
<center>
<img src='{pie_connectivity_18_24}'/>
<img src='{pie_connectivity_25_34}'/>
<img src='{pie_connectivity_35_54}'/>
<img src='{pie_connectivity_55_}'/>
</center>
""".format(name=escape(str(row["name"])),
           total_pop_fb=total_pop_fb,
           pie_fb_gender=pie_fb_gender,
           pie_fb_age=pie_fb_age,
           pyramid_age=pyramid_age,
           pyramid_age_perc=pyramid_age_perc,
           pie_connectivity=pie_connectivity,
           pie_connectivity_male=pie_connectivity_male,
           pie_connectivity_female=pie_connectivity_female,
           pie_connectivity_18_24=pie_connectivity_18_24,
           pie_connectivity_25_34=pie_connectivity_25_34,
           pie_connectivity_35_54=pie_connectivity_35_54,
           pie_connectivity_55_=pie_connectivity_55_,
           )
    return popup_html
def getPie(labels, sizes, explode=None, title=None, save_locally=True, folder=None):
    """Draw a small pie chart and return a value usable as an <img> src.

    Parameters:
        labels: one label per slice, e.g. ['Frogs', 'Hogs', 'Dogs', 'Logs'].
        sizes: one value per slice, e.g. [15, 30, 45, 10].
        explode: optional per-slice offsets, e.g. (0, 0.1, 0, 0) pulls out only
            the 2nd slice; passed through to matplotlib.
        title: optional title drawn above the pie.
        save_locally: when True the PNG is written to disk and its path is
            returned; when False a base64 "data:image/png" URI is returned.
        folder: output folder used when save_locally is True. It is created if
            it does not exist. When empty/None the file goes to the current
            directory.

    Returns:
        str: the file path (a random UUID name, optionally inside `folder`) or
        the base64 data URI.
    """
    total = np.sum(sizes)

    def label_format(pct):
        # matplotlib hands over the slice share as a float percentage. Round
        # the reconstructed count instead of truncating it, so float noise
        # (e.g. 2.9999999) cannot drop a unit.
        absolute = int(np.round(pct / 100. * total))
        return "{:.1f}%\n({:,d})".format(pct, absolute)

    fig1, ax1 = plt.subplots(figsize=(2, 2))
    try:
        # Slices are laid out clockwise (counterclock=False) starting at 90 degrees.
        ax1.pie(sizes, explode=explode, labels=labels,
                autopct=label_format,
                shadow=True, startangle=90, counterclock=False,
                wedgeprops={'linewidth': 2, 'edgecolor': "black"}
                )
        if title:
            ax1.set_title(title)
        ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

        if not save_locally:
            img_buffer = io.BytesIO()
            fig1.savefig(img_buffer, format='png', transparent=True, bbox_inches="tight")
            return "data:image/png;base64,%s" % base64.b64encode(img_buffer.getvalue()).decode('UTF-8')

        path_name = str(uuid.uuid4())
        if folder:
            # savefig does not create missing directories.
            os.makedirs(folder, exist_ok=True)
            path_name = os.path.join(folder, path_name)
        fig1.savefig(path_name, format='png', transparent=True, bbox_inches="tight")
        return path_name
    finally:
        # Always release this specific figure, even when plotting or saving fails.
        plt.close(fig1)
def getPyramid(y, labels, data_left, data_right, normalized=False, save_locally=True, folder=None):
    """Draw a two-sided horizontal bar chart (pyramid) and return an <img> src.

    Parameters:
        y: tick label per row, e.g. ['0-18', '19-25', '26+'].
        labels: legend labels for the left and right side, e.g. ['female', 'male'].
        data_left: values drawn to the left (plotted negated), e.g. [1000, 2000, 3000].
        data_right: values drawn to the right, e.g. [2000, 5000, 1000].
        normalized: when True each side is divided by its own sum before plotting.
        save_locally: when True the PNG is written to disk and its path is
            returned; when False a base64 "data:image/png" URI is returned.
        folder: output folder used when save_locally is True. It is created if
            it does not exist. When empty/None the file goes to the current
            directory.

    Returns:
        str: the file path (a random UUID name, optionally inside `folder`) or
        the base64 data URI.
    """
    data_left = np.array(data_left)
    data_right = np.array(data_right)

    if normalized:
        # A side whose total is zero is left as all zeros instead of turning
        # into NaN through 0/0.
        left_total = data_left.sum()
        right_total = data_right.sum()
        if left_total:
            data_left = data_left / left_total
        if right_total:
            data_right = data_right / right_total

    assert data_left.shape == data_right.shape
    N = range(0, data_left.shape[0])

    fig1, ax1 = plt.subplots(figsize=(2, 2))
    try:
        # The left series is negated so its bars grow away from the centre line.
        ax1.barh(N, -data_left, label=labels[0])
        ax1.barh(N, data_right, label=labels[1])
        ax1.set(yticks=N, yticklabels=y)

        # Keep only the top and bottom spines, drawn thicker (public setter
        # instead of writing the private _linewidth attribute).
        ax1.spines['right'].set_visible(False)
        ax1.spines['left'].set_visible(False)
        for side in ('top', 'bottom'):
            ax1.spines[side].set_visible(True)
            ax1.spines[side].set_linewidth(2)

        ax1.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
                   fancybox=True, shadow=True, ncol=5)

        if not save_locally:
            img_buffer = io.BytesIO()
            fig1.savefig(img_buffer, format='png', transparent=True, bbox_inches="tight")
            return "data:image/png;base64,%s" % base64.b64encode(img_buffer.getvalue()).decode('UTF-8')

        path_name = str(uuid.uuid4())
        if folder:
            # savefig does not create missing directories.
            os.makedirs(folder, exist_ok=True)
            path_name = os.path.join(folder, path_name)
        fig1.savefig(path_name, format='png', transparent=True, bbox_inches="tight")
        return path_name
    finally:
        # Always release this specific figure, even when plotting or saving fails.
        plt.close(fig1)
We then apply the getHTMLCombo function to each row in the dataframe.
To speed it up, one can use pandarallel or dask, but for this collection, this process will take less than 5 minutes.
# As this is only an example, we only pick 80 countries randomly. The cap at
# len(map_df) keeps sample() from raising when fewer than 80 rows are available.
map_df = map_df.sample(min(80, len(map_df)))
# Build the pop-up HTML of every remaining row; charts are embedded as base64.
map_df["html_to_display"] = map_df.apply(getHTMLCombo, axis=1, save_locally=False)
Processing...Pakistan
Processing...Northern Mariana Islands
Processing...Italy
Processing...Botswana
Processing...Sao Tome and Principe
Processing...Andorra
Processing...Bosnia and Herzegovina
Processing...Taiwan
Processing...Cape Verde
Processing...Georgia
Processing...Chad
Processing...Colombia
Processing...Liberia
Processing...Bulgaria
Processing...Saint Kitts and Nevis
Processing...San Marino
Processing...Turkmenistan
Processing...Mauritius
Processing...Bangladesh
Processing...Malta
Processing...Ethiopia
Processing...India
Processing...Hungary
Processing...Bahrain
Processing...Finland
Processing...Tonga
Processing...United States
Processing...Palau
Processing...Croatia
Processing...Papua New Guinea
Processing...Portugal
Processing...Gibraltar
Processing...Guinea
Processing...Luxembourg
Processing...Cyprus
Processing...Jamaica
Processing...Sri Lanka
Processing...Falkland Islands
Processing...Albania
Processing...Mozambique
Processing...Germany
Processing...Kyrgyzstan
Processing...Qatar
Processing...Russia
Processing...Montserrat
Processing...Haiti
Processing...Guam
Processing...Slovenia
Processing...Guadeloupe
Processing...Norfolk Island
Processing...Nigeria
Processing...Bolivia
Processing...Zimbabwe
Processing...Malawi
Processing...Samoa
Processing...Uganda
Processing...Curaçao
Processing...Barbados
Processing...Switzerland
Processing...Turkey
Processing...St. Lucia
Processing...Ecuador
Processing...Cayman Islands
Processing...Spain
Processing...Oman
Processing...Liechtenstein
Processing...Solomon Islands
Processing...France
Processing...Australia
Processing...Puerto Rico
Processing...Burkina Faso
Processing...Aruba
Processing...Timor-Leste
Processing...Saint Martin
Processing...Fiji
Processing...Mexico
Processing...Sierra Leone
Processing...Greece
Processing...Martinique
Processing...Seychelles
Warning
Note that, while we used `save_locally=False` in this example, we highly recommend using `save_locally=True` for real-life applications.
With `save_locally=True`, all generated images will be saved in a separate folder, making the HTML considerably lighter. To correctly render the map elsewhere, you will also need to copy the figures folder.
We opted for `save_locally=False` here so that the map renders correctly in this book, with all images displayed, without depending on the figures folder.
We remove the empty geometries and fill potentially null values in the dataset with the following commands:
# Keep only the rows whose geometry exists and is not empty, then replace any
# remaining missing values with 0.0.
has_shape = map_df["geometry"].apply(lambda geom: geom is not None and not geom.is_empty)
map_df = map_df[has_shape]
map_df = map_df.fillna(0.0)
The code below allows us to create different colormaps for different analyses. It is up to the user to decide what is most interesting to show on the map.
# Colormap 1: share of each connectivity type among the 18+ audience.
colormaps = {}
# The denominator is the same for every connectivity type, so it is computed
# once instead of on every loop iteration. It is the sum of the four
# connectivity columns, not the "AllDevices" column.
alldevices = map_df['both_18-_Wifi'] + map_df['both_18-_2G'] + map_df['both_18-_3G'] + map_df['both_18-_4G']
for infra in ["Wifi", "2G", "3G", "4G"]:
    # 0/0 (no audience at all) yields NaN, which is mapped to a share of 0.
    map_df["%" + infra] = (map_df['both_18-_%s' % (infra)] / (alldevices)).fillna(0)
    colormaps["%" + infra] = branca.colormap.LinearColormap(
        vmin=0,
        vmax=1.0,
        colors=['red', 'orange', 'white', 'green', 'darkgreen'],
        caption="%" + infra,
    )
Last, we plot the map below:
# Base map centred on (0, 0).
# NOTE(review): overlay/show/name are not documented folium.Map parameters --
# confirm they have the intended effect.
m = folium.Map(location=[0, 0], zoom_start=2,
               tiles="openstreetmap",
               overlay=False,
               show=True, name="Color Mode")

# A hidden parent group holding one toggleable sub-group per displayed metric.
fg = folium.FeatureGroup(name='groups', control=False).add_to(m)
grp1 = plugins.FeatureGroupSubGroup(fg, '%Wifi', overlay=True, show=True).add_to(m)
grps = [grp1]  # , grp2, grp3, grp4, grp5]

# Other options are displaying the map according to 2G, 3G, 4G usage
for i, element in enumerate(["%Wifi"]):  # , "%2G", "%3G", "%4G"]):
    # Pop-up shown on click: the pre-rendered HTML of the location.
    popup = GeoJsonPopup(
        labels=False, fields=["html_to_display"],
        localize=True,
        style="background-color: yellow",
        sticky=False,
        min_width=700,
        max_width=700,
    )

    # Tooltip shown on hover. The locations are countries, so the name is
    # labelled "Country:" rather than "State:".
    tooltip = GeoJsonTooltip(
        fields=["name", "both_18-_AllDevices", "both_18-_Wifi", "both_18-_2G", "both_18-_3G", "both_18-_4G"],
        aliases=["Country:", "All Population", "Wifi", "2G", "3G", "4G"],
        localize=True,
        sticky=False,
        labels=True,
        style="""
background-color: #F0EFEF;
border: 2px solid black;
border-radius: 3px;
box-shadow: 3px;
""",
        max_width=800,
    )

    # The choropleth is driven by the metric of this iteration (it used to
    # hard-code '%Wifi' regardless of `element`). Only its GeoJson layer is
    # added to the sub-group.
    my_choropleth = folium.Choropleth(
        geo_data=map_df,
        name='choropleth',
        data=map_df[["name", element]],
        columns=['name', element],
        key_on='feature.properties.name',
        fill_color='YlGn',
        nan_fill_color="black",
        fill_opacity=0.7,
        line_opacity=0.2,
        highlight=True,
        line_color='black').geojson.add_to(grps[i])

    # `element` is bound as a default argument. A plain closure would be
    # evaluated late, so with several metrics every layer would be styled by
    # the last one.
    folium.GeoJson(
        map_df,
        style_function=lambda x, element=element: {
            "fillColor": colormaps[element](x["properties"][element])
            if x["properties"][element] is not None else "transparent",
            "color": "black",
            "fillOpacity": 0.4,
        },
        highlight_function=lambda x: {'weight': 6},
        tooltip=tooltip,
        popup=popup
    ).add_to(my_choropleth)

# Legend for the Wifi share.
colormap = colormaps["%Wifi"]
colormap.caption = '% Wifi'
colormap.add_to(m)

folium.LayerControl(collapsed=False).add_to(m)
plugins.Fullscreen(
    position='bottomright',
    title='Expand me',
    title_cancel='Exit me',
    force_separate_button=True
).add_to(m)
m