9. A complete example: extracting data from the whole world

This notebook covers all the steps involved in a typical data collection using Facebook Marketing API data:

  1. Before the collection starts: we start at the very beginning by acquiring the FB Ids and shapefiles for the locations we are interested in.

  2. Data Collection: This step is where the primary data collection happens and might take weeks to finish.

  3. Post-processing The Collection: After the data collection happened, we need to post-process the data to create a clean CSV file for data analysis.

  4. Maps: We finally plot a map with the collection outcome.

9.1. Before the collection starts

We load here all necessary libraries used in this script. Feel free to comment out some of them if you are interested in only a part of this script.

import os
import io
import base64
import shutil
import pandas as pd
from pysocialwatcher.utils import double_country_conversion
from pysocialwatcher import watcherAPI
from pysocialwatcher.json_builder import JSONBuilder, AgeList, Age, Genders, get_predefined_behavior, LocationList
from pysocialwatcher import post_process
import json

# For the map
import matplotlib.pyplot as plt
import folium
from folium import plugins
from folium.features import GeoJson, GeoJsonTooltip, GeoJsonPopup
from branca.colormap import linear
import numpy as np
import branca
import uuid
import geopandas as gpd
from shapely import wkt

We start by using pySocialWatcher to download all shapefiles and country codes from a given list of locations.

# Load pySocialWatcher and the credentials from a file
watcher = watcherAPI(api_version="9.0", sleep_time=5)  # sleep_time: seconds to pause between API calls
watcher.load_credentials_file("credentials.csv")  # one token/user per row; several rows speed up collection

# Countries to collect, by display name; each is converted to its ISO-2 code below.
country_list = ['Andorra', 'United Arab Emirates', 'Afghanistan', 'Antigua and Barbuda', 'Albania', 'Armenia', 'Angola', 'Argentina', 'American Samoa',
                'Austria', 'Australia', 'Aruba', 'Azerbaijan', 'Bosnia and Herzegovina', 'Barbados', 'Bangladesh', 'Belgium', 'Burkina Faso', 'Bulgaria',
                'Bahrain', 'Burundi', 'Benin', 'Bermuda', 'Brunei', 'Bolivia', 'Brazil', 'Bahamas', 'Bhutan', 'Botswana', 'Belarus', 'Belize', 'Canada',
                'Congo Dem. Rep.', 'Central African Republic', 'Congo Rep.', 'Switzerland', "Cote d'Ivoire", 'Cook Islands', 'Chile', 'Cameroon', 'China',
                'Colombia', 'Costa Rica', 'Cape Verde', 'Curacao', 'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Denmark', 'Dominica', 'Dominican Republic',
                'Algeria', 'Ecuador', 'Estonia', 'Egypt', 'Western Sahara', 'Eritrea', 'Spain', 'Ethiopia', 'Finland', 'Fiji', 'Falkland Islands', 'Micronesia',
                'Faroe Islands', 'France', 'Gabon', 'United Kingdom', 'Grenada', 'Georgia', 'French Guiana', 'Guernsey', 'Ghana', 'Gibraltar', 'Greenland', 'Gambia',
                'Guinea-Bissau', 'Guadeloupe', 'Equatorial Guinea', 'Greece', 'Guatemala', 'Guam', 'Guinea', 'Guyana', 'Hong Kong', 'Honduras', 'Croatia', 'Haiti',
                'Hungary', 'Indonesia', 'Ireland', 'Israel', 'Isle of Man', 'India', 'Iraq', 'Iceland', 'Italy', 'Jersey', 'Jamaica', 'Jordan', 'Japan', 'Kenya',
                'Kyrgyzstan', 'Cambodia', 'Kiribati', 'Comoros', 'Saint Kitts and Nevis', 'South Korea', 'Kuwait', 'Cayman Islands', 'Kazakhstan', 'Laos', 'Lebanon',
                'Saint Lucia', 'Liechtenstein', 'Sri Lanka', 'Liberia', 'Lesotho', 'Lithuania', 'Luxembourg', 'Latvia', 'Libya', 'Morocco', 'Monaco', 'Moldova',
                'Montenegro', 'Saint Martin', 'Madagascar', 'Marshall Islands', 'Macedonia', 'Mali', 'Myanmar', 'Mongolia', 'Macau', 'Northern Mariana Islands',
                'Martinique', 'Mauritania', 'Montserrat', 'Malta', 'Mauritius', 'Maldives', 'Malawi', 'Mexico', 'Malaysia', 'Mozambique', 'Namibia', 'New Caledonia',
                'Niger', 'Norfolk Island', 'Nigeria', 'Nicaragua', 'Netherlands', 'Norway', 'Nepal', 'Nauru', 'Niue', 'New Zealand', 'Oman', 'Panama', 'Peru',
                'French Polynesia', 'Papua New Guinea', 'Philippines', 'Pakistan', 'Poland', 'Saint Pierre and Miquelon', 'Pitcairn', 'Puerto Rico', 'Palestine',
                'Portugal', 'Palau', 'Paraguay', 'Qatar', 'Reunion', 'Romania', 'Serbia', 'Russia', 'Rwanda', 'Saudi Arabia', 'Solomon Islands', 'Seychelles', 'Sweden',
                'Singapore', 'Saint Helena', 'Slovenia', 'Svalbard and Jan Mayen', 'Slovakia', 'Sierra Leone', 'San Marino', 'Senegal', 'Somalia', 'Suriname', 'South Sudan',
                'Sao Tome and Principe', 'El Salvador', 'Sint Maarten', 'Swaziland', 'Turks and Caicos Islands', 'Chad', 'Togo', 'Thailand', 'Tajikistan', 'Tokelau',
                'Timor-Leste', 'Turkmenistan', 'Tunisia', 'Tonga', 'Turkey', 'Trinidad and Tobago', 'Tuvalu', 'Taiwan', 'Tanzania', 'Ukraine', 'Uganda', 'United States',
                'Uruguay', 'Uzbekistan', 'Saint Vincent and the Grenadines', 'Venezuela', 'British Virgin Islands', 'US Virgin Islands', 'Vietnam', 'Vanuatu',
                'Wallis and Futuna', 'Samoa', 'Kosovo', 'Yemen', 'Mayotte', 'South Africa', 'Zambia', 'Zimbabwe']

# Download the KML shapefile of every country and stack them into one dataframe.
country_frames = []
for name in country_list:
    # Map the country name to its ISO two-letter code: United States -> US.
    code = double_country_conversion(name)

    print("Downloading %s (%s)" % (name, code))
    country_frames.append(watcherAPI.get_KML_given_geolocation("countries", [code]))

df = pd.concat(country_frames)
df.head(3)
Downloading Andorra (AD)
Downloading United Arab Emirates (AE)
Downloading Afghanistan (AF)
Downloading Antigua and Barbuda (AG)
Downloading Albania (AL)
Downloading Armenia (AM)
Downloading Angola (AO)
Downloading Argentina (AR)
Downloading American Samoa (AS)
Downloading Austria (AT)
Downloading Australia (AU)
Downloading Aruba (AW)
Downloading Azerbaijan (AZ)
Downloading Bosnia and Herzegovina (BA)
Downloading Barbados (BB)
Downloading Bangladesh (BD)
Downloading Belgium (BE)
Downloading Burkina Faso (BF)
Downloading Bulgaria (BG)
Downloading Bahrain (BH)
Downloading Burundi (BI)
Downloading Benin (BJ)
Downloading Bermuda (BM)
Downloading Brunei (BN)
Downloading Bolivia (BO)
Downloading Brazil (BR)
Downloading Bahamas (BS)
Downloading Bhutan (BT)
Downloading Botswana (BW)
Downloading Belarus (BY)
Downloading Belize (BZ)
Downloading Canada (CA)
Downloading Congo Dem. Rep. (CD)
Downloading Central African Republic (CF)
Downloading Congo Rep. (CG)
Downloading Switzerland (CH)
Downloading Cote d'Ivoire (CI)
Downloading Cook Islands (CK)
Downloading Chile (CL)
Downloading Cameroon (CM)
Downloading China (CN)
Downloading Colombia (CO)
Downloading Costa Rica (CR)
Downloading Cape Verde (CV)
Downloading Curacao (CW)
Downloading Cyprus (CY)
Downloading Czech Republic (CZ)
Downloading Germany (DE)
Downloading Djibouti (DJ)
Downloading Denmark (DK)
Downloading Dominica (DM)
Downloading Dominican Republic (DO)
Downloading Algeria (DZ)
Downloading Ecuador (EC)
Downloading Estonia (EE)
Downloading Egypt (EG)
Downloading Western Sahara (EH)
Downloading Eritrea (ER)
Downloading Spain (ES)
Downloading Ethiopia (ET)
Downloading Finland (FI)
Downloading Fiji (FJ)
Downloading Falkland Islands (FK)
Downloading Micronesia (FM)
Downloading Faroe Islands (FO)
Downloading France (FR)
Downloading Gabon (GA)
Downloading United Kingdom (GB)
Downloading Grenada (GD)
Downloading Georgia (GE)
Downloading French Guiana (GF)
Downloading Guernsey (GG)
Downloading Ghana (GH)
Downloading Gibraltar (GI)
Downloading Greenland (GL)
Downloading Gambia (GM)
Downloading Guinea-Bissau (GN)
Downloading Guadeloupe (GP)
Downloading Equatorial Guinea (GQ)
Downloading Greece (GR)
Downloading Guatemala (GT)
Downloading Guam (GU)
Downloading Guinea (GW)
Downloading Guyana (GY)
Downloading Hong Kong (HK)
Downloading Honduras (HN)
Downloading Croatia (HR)
Downloading Haiti (HT)
Downloading Hungary (HU)
Downloading Indonesia (ID)
Downloading Ireland (IE)
Downloading Israel (IL)
Downloading Isle of Man (IM)
Downloading India (IN)
Downloading Iraq (IQ)
Downloading Iceland (IS)
Downloading Italy (IT)
Downloading Jersey (JE)
Downloading Jamaica (JM)
Downloading Jordan (JO)
Downloading Japan (JP)
Downloading Kenya (KE)
Downloading Kyrgyzstan (KG)
Downloading Cambodia (KH)
Downloading Kiribati (KI)
Downloading Comoros (KM)
Downloading Saint Kitts and Nevis (KN)
Downloading South Korea (KR)
Downloading Kuwait (KW)
Downloading Cayman Islands (KY)
Downloading Kazakhstan (KZ)
Downloading Laos (LA)
Downloading Lebanon (LB)
Downloading Saint Lucia (LC)
Downloading Liechtenstein (LI)
Downloading Sri Lanka (LK)
Downloading Liberia (LR)
Downloading Lesotho (LS)
Downloading Lithuania (LT)
Downloading Luxembourg (LU)
Downloading Latvia (LV)
Downloading Libya (LY)
Downloading Morocco (MA)
Downloading Monaco (MC)
Downloading Moldova (MD)
Downloading Montenegro (ME)
Downloading Saint Martin (MF)
Downloading Madagascar (MG)
Downloading Marshall Islands (MH)
Downloading Macedonia (MK)
Downloading Mali (ML)
Downloading Myanmar (MM)
Downloading Mongolia (MN)
Downloading Macau (MO)
Downloading Northern Mariana Islands (MP)
Downloading Martinique (MQ)
Downloading Mauritania (MR)
Downloading Montserrat (MS)
Downloading Malta (MT)
Downloading Mauritius (MU)
Downloading Maldives (MV)
Downloading Malawi (MW)
Downloading Mexico (MX)
Downloading Malaysia (MY)
Downloading Mozambique (MZ)
Downloading Namibia (NA)
Downloading New Caledonia (NC)
Downloading Niger (NE)
Downloading Norfolk Island (NF)
Downloading Nigeria (NG)
Downloading Nicaragua (NI)
Downloading Netherlands (NL)
Downloading Norway (NO)
Downloading Nepal (NP)
Downloading Nauru (NR)
Downloading Niue (NU)
Downloading New Zealand (NZ)
Downloading Oman (OM)
Downloading Panama (PA)
Downloading Peru (PE)
Downloading French Polynesia (PF)
Downloading Papua New Guinea (PG)
Downloading Philippines (PH)
Downloading Pakistan (PK)
Downloading Poland (PL)
Downloading Saint Pierre and Miquelon (PM)
Downloading Pitcairn (PN)
Downloading Puerto Rico (PR)
Downloading Palestine (PS)
Downloading Portugal (PT)
Downloading Palau (PW)
Downloading Paraguay (PY)
Downloading Qatar (QA)
Downloading Reunion (RE)
Downloading Romania (RO)
Downloading Serbia (RS)
Downloading Russia (RU)
Downloading Rwanda (RW)
Downloading Saudi Arabia (SA)
Downloading Solomon Islands (SB)
Downloading Seychelles (SC)
Downloading Sweden (SE)
Downloading Singapore (SG)
Downloading Saint Helena (SH)
Downloading Slovenia (SI)
Downloading Svalbard and Jan Mayen (SJ)
Downloading Slovakia (SK)
Downloading Sierra Leone (SL)
Downloading San Marino (SM)
Downloading Senegal (SN)
Downloading Somalia (SO)
Downloading Suriname (SR)
Downloading South Sudan (SS)
Downloading Sao Tome and Principe (ST)
Downloading El Salvador (SV)
Downloading Sint Maarten (SX)
Downloading Swaziland (SZ)
Downloading Turks and Caicos Islands (TC)
Downloading Chad (TD)
Downloading Togo (TG)
Downloading Thailand (TH)
Downloading Tajikistan (TJ)
Downloading Tokelau (TK)
Downloading Timor-Leste (TL)
Downloading Turkmenistan (TM)
Downloading Tunisia (TN)
Downloading Tonga (TO)
Downloading Turkey (TR)
Downloading Trinidad and Tobago (TT)
Downloading Tuvalu (TV)
Downloading Taiwan (TW)
Downloading Tanzania (TZ)
Downloading Ukraine (UA)
Downloading Uganda (UG)
Downloading United States (US)
Downloading Uruguay (UY)
Downloading Uzbekistan (UZ)
Downloading Saint Vincent and the Grenadines (VC)
Downloading Venezuela (VE)
Downloading British Virgin Islands (VG)
Downloading US Virgin Islands (VI)
Downloading Vietnam (VN)
Downloading Vanuatu (VU)
Downloading Wallis and Futuna (WF)
Downloading Samoa (WS)
Downloading Kosovo (XK)
Downloading Yemen (YE)
Downloading Mayotte (YT)
Downloading South Africa (ZA)
Downloading Zambia (ZM)
Downloading Zimbabwe (ZW)
name kml key
0 Andorra <Polygon><outerBoundaryIs><LinearRing><coordin... AD
0 United Arab Emirates <Polygon><outerBoundaryIs><LinearRing><coordin... AE
0 Afghanistan <Polygon><outerBoundaryIs><LinearRing><coordin... AF

We take advantage of JSONBuilder here to generate a JSON file for the data collection. Details can be found in our previous notebook.

# Build the JSON collection specification from the locations, ages, genders,
# and connectivity behaviors defined below, then save it — together with a
# geojson of the shapefiles — for the collection and mapping steps.

# Location list given a dataframe
loclist = LocationList()
loclist.get_location_list_from_df(df)

# Age groups: an overall 18+ bucket plus four brackets (None = open-ended).
ageList = AgeList()
ageList.add(Age(18, None))
ageList.add(Age(18, 24))
ageList.add(Age(25, 34))
ageList.add(Age(35, 54))
ageList.add(Age(55, None))

# Gender: male, female, and both combined
genders = Genders(male=True, female=True, combined=True)

# Using the pre-defined behavior option of connectivity (which collects #users using Wifi, 2G, 3G, 4G)
connectivity = get_predefined_behavior(option="connectivity")  # renamed: was misspelled "connetivity"

# Create the jsonbuilder object using what was defined before:
jsonb = JSONBuilder(name="LargeWorldCollection",
                    age_list=ageList,
                    location_list=loclist,
                    genders=genders,
                    behavior_groups=connectivity)

# Save json file for collection:
json_outputname = "countries_whole_world.json"
jsonb.jsonfy(json_outputname)

# Duplicate the ISO code column under the name expected by the geojson export.
df["country_code"] = df["key"]

# Save geojson file to use it with Folium later on.
geojson_outname = "countries_whole_world.geojson"
watcherAPI.transform_KML_into_geojson(df, geojson_outname)

print("Created file %s" % geojson_outname)
Created file countries_whole_world.json.
Created file countries_whole_world.geojson

9.1.1. Simplifying the shapefiles/KMLs/geojsons:

Most of the time, simplifying the shapefiles makes the HTML created by Folium faster without losing precision. That happens because the KML/Geojson from Facebook might be too big to plot using Folium.

A straightforward way to still use it successfully is by simplifying the shapefile/KML/Geojson using the command line ogr2ogr from the GDAL package.

See https://gdal.org/download.html with instructions on how to download it for your OS.

If you have installed it on your Linux system, the following command can help to decrease the size of the shapefile:

# Shrink the geojson with ogr2ogr (GDAL) so Folium renders it faster.
if shutil.which("ogr2ogr") is not None: # check if this command is available first
    oldname = geojson_outname
    # if oldname is OLDNAME.extension, newname is OLDNAME_simplified.extension
    # Bug fix: rsplit('.') split on EVERY dot, so a base name containing a dot
    # (e.g. "world.v2.geojson") produced the wrong extension; maxsplit=1 splits
    # only at the last dot.
    base, ext = geojson_outname.rsplit('.', 1)
    newname = base + "_simplified." + ext

    # -simplify 0.01 applies a 0.01-degree geometry tolerance.
    command = "ogr2ogr -f geojson -simplify 0.01 %s %s" % (newname, oldname)
    print("\033[1mRunning command:\033[0m\n%s" % command)
    os.system(command)

    # Report both sizes in MB (1<<20 bytes).
    print ('* Size of old file: {:,.1f}'.format(os.path.getsize(oldname)/float(1<<20))+" MB")    
    print ('* Size of simplified file: {:,.1f}'.format(os.path.getsize(newname)/float(1<<20))+" MB")
Running command:
ogr2ogr -f geojson -simplify 0.01 countries_whole_world_simplified.geojson countries_whole_world.geojson
* Size of old file: 61.6 MB
* Size of simplified file: 9.7 MB

9.2. Collecting data

This is a very large data collection with 17,325 API calls. There are several ways to speed it up:

  1. Increasing the number of users linked to an APP

  2. Getting a business token

  3. Or simply by using multiple users/tokens at the same time: for this you just need to have multiple rows in the credentials.csv file

# This cell performs the collection. It might take several days.
# sleep_time=7 throttles API calls; outputname is the gzipped CSV target.
watcher = watcherAPI(api_version="9.0", sleep_time=7, outputname="world_collection_psw.csv.gz")
# remove_tmp_files=True deletes the intermediate per-request files afterwards.
df = watcher.run_data_collection("countries_whole_world.json", remove_tmp_files=True)
# Omitted for brevity

9.3. Post-processing Steps

Once the data collection is done, we can simply post-process it with the following commands:

# Reload the raw collection output written by the step above.
df = pd.read_csv("./world_collection_psw.csv.gz")
df.head(5)
Unnamed: 0 name interests ages_ranges genders behavior scholarities languages family_statuses relationship_statuses ... household_composition all_fields targeting response dau_audience mau_audience access_device timestamp publisher_platforms mock_response
0 0 LargeWorldCollection NaN {'min': 18} 0 NaN NaN NaN NaN NaN ... NaN (('ages_ranges', {'min': 18}), ('genders', 0),... {'geo_locations': {'countries': ['AD'], 'locat... b'{"data":[{"daily_outcomes_curve":[{"spend":0... 0 1000 {'name': '2G', 'or': [6017253486583]} 1612206421 ["facebook"] False
1 1 LargeWorldCollection NaN {'min': 18} 0 NaN NaN NaN NaN NaN ... NaN (('ages_ranges', {'min': 18}), ('genders', 0),... {'geo_locations': {'countries': ['AE'], 'locat... b'{"data":[{"daily_outcomes_curve":[{"spend":0... 1124 2500 {'name': '2G', 'or': [6017253486583]} 1612206421 ["facebook"] False
2 2 LargeWorldCollection NaN {'min': 18} 0 NaN NaN NaN NaN NaN ... NaN (('ages_ranges', {'min': 18}), ('genders', 0),... {'geo_locations': {'countries': ['AF'], 'locat... b'{"data":[{"daily_outcomes_curve":[{"spend":0... 36418 120000 {'name': '2G', 'or': [6017253486583]} 1612206421 ["facebook"] False
3 3 LargeWorldCollection NaN {'min': 18} 0 NaN NaN NaN NaN NaN ... NaN (('ages_ranges', {'min': 18}), ('genders', 0),... {'geo_locations': {'countries': ['AG'], 'locat... b'{"data":[{"daily_outcomes_curve":[{"spend":0... 994 1000 {'name': '2G', 'or': [6017253486583]} 1612206421 ["facebook"] False
4 4 LargeWorldCollection NaN {'min': 18} 0 NaN NaN NaN NaN NaN ... NaN (('ages_ranges', {'min': 18}), ('genders', 0),... {'geo_locations': {'countries': ['AL'], 'locat... b'{"data":[{"daily_outcomes_curve":[{"spend":0... 399 1000 {'name': '2G', 'or': [6017253486583]} 1612206421 ["facebook"] False

5 rows × 21 columns

# Normalize the raw API rows into tidy, human-readable columns.
processed_df = post_process.post_process_df_collection(df)
processed_df.head(4)[["LocationType", "FullLocation", "Gender", "Ages", "Education"]]
LocationType FullLocation Gender Ages Education
0 country AD both 18- AllDegrees
1 country AE both 18- AllDegrees
2 country AF both 18- AllDegrees
3 country AG both 18- AllDegrees
# Combine Gender/Ages/Device into a single "combo" column, then pivot so each
# location (Key) becomes one row with one mau_audience column per combo.
cols_to_combine = ["Gender", "Ages", "Device"]
combo_df = post_process.combine_cols(processed_df, cols_to_combine)

combo_df = combo_df.pivot(index="Key", columns="combo", values="mau_audience").reset_index()
combo_df.head()
combo Key both_18-24_2G both_18-24_3G both_18-24_4G both_18-24_AllDevices both_18-24_Wifi both_18-_2G both_18-_3G both_18-_4G both_18-_AllDevices ... male_35-54_2G male_35-54_3G male_35-54_4G male_35-54_AllDevices male_35-54_Wifi male_55-_2G male_55-_3G male_55-_4G male_55-_AllDevices male_55-_Wifi
0 AD 1000 1000 1000 2900 1500 1000 1000 1000 39000 ... 1000 1000 1000 8900 5300 1000 1000 1000 3900 2200
1 AE 1000 13000 140000 1100000 700000 2500 79000 960000 7800000 ... 1000 18000 270000 1900000 1200000 1000 2100 24000 200000 130000
2 AF 47000 910000 120000 1500000 69000 120000 2300000 320000 3900000 ... 6300 180000 34000 330000 35000 8800 80000 8800 140000 9600
3 AG 1000 1000 1800 9300 4300 1000 4000 12000 55000 ... 1000 1000 2200 9300 4100 1000 1000 1000 3400 1700
4 AL 1000 4000 22000 240000 120000 1000 24000 120000 1200000 ... 1000 5000 24000 240000 140000 1000 3400 12000 110000 71000

5 rows × 76 columns

9.4. Maps

Last, we use Folium to create a visualization.

The first thing that we do is loading the simplified geojson file that we created at the beginning of this notebook.

# Load the simplified shapefiles as a GeoDataFrame (EPSG:4326 = WGS84 lat/lng).
data = json.load(open("./countries_whole_world_simplified.geojson")) 
kml_df = gpd.GeoDataFrame.from_features(data, crs='EPSG:4326')
kml_df = kml_df.rename(columns={"key": "Key"})  # align with combo_df's key column

# :::important::: you need to merge like below (GeoDataFrame.merge, geo frame on
# the left) so the result stays a GeoDataFrame. If you try to use
# pd.merge(df1, df2), it won't work!
map_df = kml_df.merge(combo_df)
map_df.head(2)
geometry name Key country both_18-24_2G both_18-24_3G both_18-24_4G both_18-24_AllDevices both_18-24_Wifi both_18-_2G ... male_35-54_2G male_35-54_3G male_35-54_4G male_35-54_AllDevices male_35-54_Wifi male_55-_2G male_55-_3G male_55-_4G male_55-_AllDevices male_55-_Wifi
0 POLYGON ((1.40970 42.48709, 1.44137 42.47546, ... Andorra AD Andorra 1000 1000 1000 2900 1500 1000 ... 1000 1000 1000 8900 5300 1000 1000 1000 3900 2200
1 POLYGON ((56.37298 24.98056, 56.36980 24.97986... United Arab Emirates AE United Arab Emirates 1000 13000 140000 1100000 700000 2500 ... 1000 18000 270000 1900000 1200000 1000 2100 24000 200000 130000

2 rows × 79 columns

The next cells will load the files needed to process the HTML pop-up on the map. Note that we are saving the plots into a folder called figures. This largely helps to create a lighter HTML source code which will be much faster to load later on. The drawback is that we will need to move the folder figures together with the HTML file created by Folium.

def getHTMLCombo(row, save_locally=False, folder="figures/"):
    """Build the HTML pop-up content for one location of the map.

    Parameters
    ----------
    row : pandas.Series
        One row of the merged map dataframe; must contain "name" plus the
        pivoted ``<gender>_<age>_<device>`` audience columns.
    save_locally : bool
        If False, the charts are embedded as base64 data URIs (self-contained
        but heavy HTML). If True, they are written as files under ``folder``
        and referenced by path, so the folder must travel with the HTML.
    folder : str
        Destination folder for chart images when ``save_locally`` is True.

    Returns
    -------
    str
        An HTML fragment with gender/age/connectivity pie charts and two
        age pyramids (absolute and normalized) for this location.
    """

    print("Processing..." + row["name"])

    # Headline figure: all users 18+, any device.
    total_pop_fb = float(row['both_18-_AllDevices'])

    pie_fb_gender = getPie(["Male", "Female"], [row["male_18-_AllDevices"], row["female_18-_AllDevices"]],
                           title="Genders (FB)", save_locally=save_locally, folder=folder)

    # Connectivity breakdowns: overall, per gender, and per age bracket.
    pie_connectivity = getPie(["Wifi", "2G", "3G", "4G"], [row["both_18-_Wifi"], row["both_18-_2G"],
                                                           row["both_18-_3G"], row["both_18-_4G"]],
                              title="Connectivity (All)", save_locally=save_locally, folder=folder)
    pie_connectivity_male = getPie(["Wifi", "2G", "3G", "4G"], [row["male_18-_Wifi"], row["male_18-_2G"],
                                                                row["male_18-_3G"], row["male_18-_4G"]],
                                   title="Connectivity (Male)", save_locally=save_locally, folder=folder)
    pie_connectivity_female = getPie(["Wifi", "2G", "3G", "4G"], [row["female_18-_Wifi"], row["female_18-_2G"],
                                                                  row["female_18-_3G"], row["female_18-_4G"]],
                                     title="Connectivity (Female)", save_locally=save_locally, folder=folder)
    pie_connectivity_18_24 = getPie(["Wifi", "2G", "3G", "4G"], [row["both_18-24_Wifi"], row["both_18-24_2G"],
                                                                row["both_18-24_3G"], row["both_18-24_4G"]],
                                    title="Connectivity (18-24)", save_locally=save_locally, folder=folder)
    pie_connectivity_25_34 = getPie(["Wifi", "2G", "3G", "4G"], [row["both_25-34_Wifi"], row["both_25-34_2G"],
                                                                row["both_25-34_3G"], row["both_25-34_4G"]],
                                    title="Connectivity (25-34)", save_locally=save_locally, folder=folder)
    pie_connectivity_35_54 = getPie(["Wifi", "2G", "3G", "4G"], [row["both_35-54_Wifi"], row["both_35-54_2G"],
                                                                row["both_35-54_3G"], row["both_35-54_4G"]],
                                    title="Connectivity (35-54)", save_locally=save_locally, folder=folder)
    pie_connectivity_55_ = getPie(["Wifi", "2G", "3G", "4G"], [row["both_55-_Wifi"], row["both_55-_2G"],
                                                                row["both_55-_3G"], row["both_55-_4G"]],
                                  title="Connectivity (55+)", save_locally=save_locally, folder=folder)

    # Age distribution across the four disjoint brackets.
    # Bug fix: the "18-24" slice previously read "both_18-_AllDevices"
    # (the 18+ TOTAL) instead of the 18-24 bracket.
    pie_fb_age = getPie(["18-24", "25-34", "35-54", "55+"], [row["both_18-24_AllDevices"], row["both_25-34_AllDevices"],
                                                          row["both_35-54_AllDevices"], row["both_55-_AllDevices"]],
                        title="Ages (FB)", save_locally=save_locally, folder=folder)

    pyramid_age = getPyramid(["18-24", "25-34", "35-54", "55+"], ["female", "male"],
                             [row["female_18-24_AllDevices"], row["female_25-34_AllDevices"], row["female_35-54_AllDevices"], row["female_55-_AllDevices"]],
                             [row["male_18-24_AllDevices"], row["male_25-34_AllDevices"], row["male_35-54_AllDevices"], row["male_55-_AllDevices"]],
                             save_locally=save_locally, folder=folder)
    pyramid_age_perc = getPyramid(["18-24", "25-34", "35-54", "55+"], ["female", "male"],
                                  [row["female_18-24_AllDevices"], row["female_25-34_AllDevices"], row["female_35-54_AllDevices"], row["female_55-_AllDevices"]],
                                  [row["male_18-24_AllDevices"], row["male_25-34_AllDevices"], row["male_35-54_AllDevices"], row["male_55-_AllDevices"]],
                                  normalized=True, save_locally=save_locally, folder=folder)

    html = """
    <h3> <b> Location: </b> <i> {name} </i>  
    <h5> <b>Total #Users (FB): </b> {total_pop_fb:,} </br> </h5>
    
    <h5> <b> Gender Distribution </b> </h5>
    <center>
        <img src='{pie_fb_gender}'/>
    </center>
    
    <h5> <b> Age Distribution </b> </h5>
    <center>
        <img src='{pie_fb_age}'/>
    </center>
    
    <h5> <b> Age Distribution per Gender</b> </h5>
    <center>
        <img src='{pyramid_age}'/>
        <img src='{pyramid_age_perc}'/>
    </center>
    
    <h5> <b> Connectivity according to FB </b> </h5>
    <center>
        <img src='{pie_connectivity}'/>
        <img src='{pie_connectivity_male}'/>
        <img src='{pie_connectivity_female}'/>
    </center>
    
    <h5> <b> Connectivity by Age </b> </h5>
    <center>
        <img src='{pie_connectivity_18_24}'/>
        <img src='{pie_connectivity_25_34}'/>
        <img src='{pie_connectivity_35_54}'/>
        <img src='{pie_connectivity_55_}'/>
    </center>
    
    """.format(name=row["name"].encode('raw_unicode_escape'),
               # NOTE(review): encoding "name" yields a bytes object, which
               # renders literally as b'...' in the HTML — presumably a
               # unicode-escaping workaround; confirm before changing.
               total_pop_fb=total_pop_fb,
               pie_fb_gender=pie_fb_gender,
               pie_fb_age=pie_fb_age,
               pyramid_age=pyramid_age,
               pyramid_age_perc=pyramid_age_perc,
               pie_connectivity=pie_connectivity,
               pie_connectivity_male=pie_connectivity_male,
               pie_connectivity_female=pie_connectivity_female,
               pie_connectivity_18_24=pie_connectivity_18_24,
               pie_connectivity_25_34=pie_connectivity_25_34,
               pie_connectivity_35_54=pie_connectivity_35_54,
               pie_connectivity_55_=pie_connectivity_55_,
    )

    return html

def getPie(labels, sizes, explode=None, title=None, save_locally=True, folder=None):
    """Draw a small pie chart and return a path or a base64 data URI.

    Parameters
    ----------
    labels : sequence of str
        One label per slice, e.g. ``['Frogs', 'Hogs', 'Dogs', 'Logs']``.
    sizes : sequence of numbers
        One value per slice, e.g. ``[15, 30, 45, 10]``.
    explode : sequence of float, optional
        Per-slice radial offset, e.g. ``(0, 0.1, 0, 0)`` pops the 2nd slice.
    title : str, optional
        Chart title.
    save_locally : bool
        If True, save a PNG under a random name inside ``folder`` and return
        its path; if False, return a ``data:image/png;base64,...`` URI.
    folder : str, optional
        Target folder when ``save_locally`` is True.

    Returns
    -------
    str
        File path or data URI, suitable for an <img src=...>.
    """

    def label_format(pct, allvals):
        # Show both the percentage and the absolute count of each slice.
        absolute = int(pct / 100. * np.sum(allvals))
        return "{:.1f}%\n({:,d})".format(pct, absolute)

    fig1, ax1 = plt.subplots(figsize=(2, 2))
    # Slices ordered and plotted clockwise from the top (startangle=90).
    ax1.pie(sizes, explode=explode, labels=labels,
            autopct=lambda pct: label_format(pct, sizes),
            shadow=True, startangle=90, counterclock=False,
            wedgeprops={'linewidth': 2, 'edgecolor': "black"}
            )

    if title:
        ax1.set_title(title)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

    if not save_locally:
        # Embed the PNG directly in the HTML as a base64 data URI.
        img_buffer = io.BytesIO()
        plt.savefig(img_buffer, format='png', transparent=True, bbox_inches="tight")
        img_buffer.seek(0)
        plt.close()
        return "data:image/png;base64,%s" % base64.b64encode(img_buffer.getvalue()).decode('UTF-8')

    else:
        # Random name avoids collisions between charts of different rows.
        random_name = str(uuid.uuid4())  # dropped unused duplicate binding "unique_filename"
        if folder:
            path_name = os.path.join(folder, random_name)
        else:
            path_name = random_name

        plt.savefig(path_name, format='png', transparent=True, bbox_inches="tight")
        plt.close()
        return path_name

def getPyramid(y, labels, data_left, data_right, normalized=False, save_locally=True, folder=None):
    """Draw a back-to-back bar chart (population pyramid); return path or URI.

    Parameters
    ----------
    y : sequence of str
        Tick labels for the brackets, e.g. ``['0-18', '19-25', '26+']``.
    labels : sequence of two str
        Legend labels, e.g. ``['female', 'male']`` (left side first).
    data_left, data_right : sequences of numbers
        One value per bracket for each side; must have equal length,
        e.g. ``[1000, 2000, 3000]`` and ``[2000, 5000, 1000]``.
    normalized : bool
        If True, each side is expressed as a share of its own total.
    save_locally : bool
        If True, save a PNG under a random name inside ``folder`` and return
        its path; if False, return a ``data:image/png;base64,...`` URI.
    folder : str, optional
        Target folder when ``save_locally`` is True.

    Returns
    -------
    str
        File path or data URI, suitable for an <img src=...>.
    """

    data_left = np.array(data_left)
    data_right = np.array(data_right)

    if normalized:
        data_left = data_left / data_left.sum()
        data_right = data_right / data_right.sum()

    assert data_left.shape == data_right.shape
    N = range(0, data_left.shape[0])

    fig1, ax1 = plt.subplots(figsize=(2, 2))

    # Left side is negated so its bars grow leftwards from the center axis.
    ax1.barh(N, -data_left, label=labels[0])
    ax1.barh(N, data_right, label=labels[1])

    ax1.set(yticks=N, yticklabels=y)

    # Show only the top and bottom spines, slightly thickened.
    # Fixed: was assigning the private attribute Spine._linewidth directly;
    # set_linewidth() is the public API.
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(True)
    ax1.spines['top'].set_linewidth(2)

    ax1.spines['left'].set_visible(False)
    ax1.spines['bottom'].set_visible(True)
    ax1.spines['bottom'].set_linewidth(2)

    ax1.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
               fancybox=True, shadow=True, ncol=5)

    if not save_locally:
        # Embed the PNG directly in the HTML as a base64 data URI.
        img_buffer = io.BytesIO()
        plt.savefig(img_buffer, format='png', transparent=True, bbox_inches="tight")
        img_buffer.seek(0)
        plt.close()
        return "data:image/png;base64,%s" % base64.b64encode(img_buffer.getvalue()).decode('UTF-8')

    else:
        # Random name avoids collisions between charts of different rows.
        random_name = str(uuid.uuid4())  # dropped unused duplicate binding "unique_filename"
        if folder:
            path_name = os.path.join(folder, random_name)
        else:
            path_name = random_name

        plt.savefig(path_name, format='png', transparent=True, bbox_inches="tight")
        plt.close()
        return path_name

We then apply the getHTMLCombo function to each row in the dataframe.

To speed it up, one can use pandarallel or dask, but for this collection, this process will take less than 5 minutes.

map_df = map_df.sample(80) # As this is only an example, we will only pick 80 countries randomly 
# Build each row's pop-up HTML; save_locally=False embeds charts as data URIs.
map_df["html_to_display"] = map_df.apply(getHTMLCombo, axis=1, save_locally=False)
Processing...Pakistan
Processing...Northern Mariana Islands
Processing...Italy
Processing...Botswana
Processing...Sao Tome and Principe
Processing...Andorra
Processing...Bosnia and Herzegovina
Processing...Taiwan
Processing...Cape Verde
Processing...Georgia
Processing...Chad
Processing...Colombia
Processing...Liberia
Processing...Bulgaria
Processing...Saint Kitts and Nevis
Processing...San Marino
Processing...Turkmenistan
Processing...Mauritius
Processing...Bangladesh
Processing...Malta
Processing...Ethiopia
Processing...India
Processing...Hungary
Processing...Bahrain
Processing...Finland
Processing...Tonga
Processing...United States
Processing...Palau
Processing...Croatia
Processing...Papua New Guinea
Processing...Portugal
Processing...Gibraltar
Processing...Guinea
Processing...Luxembourg
Processing...Cyprus
Processing...Jamaica
Processing...Sri Lanka
Processing...Falkland Islands
Processing...Albania
Processing...Mozambique
Processing...Germany
Processing...Kyrgyzstan
Processing...Qatar
Processing...Russia
Processing...Montserrat
Processing...Haiti
Processing...Guam
Processing...Slovenia
Processing...Guadeloupe
Processing...Norfolk Island
Processing...Nigeria
Processing...Bolivia
Processing...Zimbabwe
Processing...Malawi
Processing...Samoa
Processing...Uganda
Processing...Curaçao
Processing...Barbados
Processing...Switzerland
Processing...Turkey
Processing...St. Lucia
Processing...Ecuador
Processing...Cayman Islands
Processing...Spain
Processing...Oman
Processing...Liechtenstein
Processing...Solomon Islands
Processing...France
Processing...Australia
Processing...Puerto Rico
Processing...Burkina Faso
Processing...Aruba
Processing...Timor-Leste
Processing...Saint Martin
Processing...Fiji
Processing...Mexico
Processing...Sierra Leone
Processing...Greece
Processing...Martinique
Processing...Seychelles

Warning

Note that, while we used save_locally=False in this example, we highly recommend using save_locally=True for real-life applications. With save_locally=True, all generated images will be saved in a separate folder, making the HTML considerably lighter. To correctly render the map elsewhere, you will also need to copy the figures folder.

We opted for save_locally=False here to render the map in this book correctly without depending on the figures folder, making sure that all images are displayed correctly.

We remove the empty geometries and fill potentially null values in the dataset with the following commands:

# Drop features with missing or empty geometry (they cannot be rendered).
map_df = map_df[~map_df["geometry"].apply(lambda x: x is None or x.is_empty)]
# Replace any remaining NaN audience values with 0.0.
map_df = map_df.fillna(0.0)

The code below allows us to create different colormaps for different analyses. This is up to the users to define the most interesting thing to show on the map.

# Colormap 1: share of 18+ users on each connection type.
colormaps = {}

# The all-devices denominator does not depend on the loop variable, so
# compute it once instead of recomputing it for every connection type.
alldevices = map_df['both_18-_Wifi'] + map_df['both_18-_2G'] + map_df['both_18-_3G'] + map_df['both_18-_4G']

for infra in ["Wifi", "2G", "3G", "4G"]:
    # Fraction of users on this connection type; NaN (0/0) becomes 0.
    map_df["%" + infra] = (map_df['both_18-_%s' % (infra)] / (alldevices)).fillna(0)
    colormaps["%" + infra] = branca.colormap.LinearColormap(
        vmin=0,
        vmax=1.0, 
        colors=['red', 'orange', 'white', 'green', 'darkgreen'],
        caption="%" + infra,
    )

Last, we plot the map below:

# Base world map centered on (0, 0).
m = folium.Map(location=[0, 0], zoom_start=2, 
               tiles="openstreetmap", 
               overlay=False, 
               show=True, name="Color Mode")

# Parent group (hidden from the layer control) holding one sub-group per metric.
fg = folium.FeatureGroup(name='groups', control=False).add_to(m)
grp1 = plugins.FeatureGroupSubGroup(fg, '%Wifi', overlay=True, show=True).add_to(m)

grps = [grp1] #, grp2, grp3, grp4, grp5]

# Other options are displaying the map according to 2G, 3G, 4G usage
for i, element in enumerate(["%Wifi"]): #, "%2G", "%3G", "%4G"]):
    
    # Click pop-up: renders the pre-built HTML column for the feature.
    popup = GeoJsonPopup(
        labels= False, fields = ["html_to_display"],
        localize=True,
        style="background-color: yellow",
        sticky=False,
        min_width=700,
        max_width=700,
    )

    # Hover tooltip: quick audience numbers per connection type.
    tooltip = GeoJsonTooltip(
        fields=["name", "both_18-_AllDevices", "both_18-_Wifi", "both_18-_2G", "both_18-_3G", "both_18-_4G",],
        aliases=["State:", "All Population", "Wifi", "2G", "3G", "4G"],
        localize=True,
        sticky=False,
        labels=True,
        style="""
            background-color: #F0EFEF;
            border: 2px solid black;
            border-radius: 3px;
            box-shadow: 3px;
        """,
        max_width=800,
    )

    # Choropleth colored by %Wifi; only its geojson layer is kept (added to
    # the sub-group), so the legend comes from the colormap added below.
    my_choropleth = folium.Choropleth(
        geo_data=map_df,
        name='choropleth',
        data=map_df[["name", '%Wifi']],
        columns=['name', '%Wifi'],
        key_on='feature.properties.name',
        fill_color='YlGn',
        nan_fill_color="black",
        fill_opacity=0.7,
        line_opacity=0.2,
        highlight=True,
        line_color='black').geojson.add_to(grps[i])
    
    # NOTE(review): `element` is captured late-bound by the lambda below; with
    # a single metric this is fine, but enabling more metrics would make every
    # layer use the LAST element — bind it via a default (lambda x, e=element: ...).
    view1 = folium.GeoJson(
        map_df,
        style_function=lambda x: {
            "fillColor": colormaps[element](x["properties"][element]) 
                if x["properties"][element] is not None else "transparent",
            "color": "black",
            "fillOpacity": 0.4,
        },
        highlight_function = lambda x: {'weight': 6},
        tooltip=tooltip,
        popup=popup
    ).add_to(my_choropleth)

# Attach the %Wifi color scale as the map legend.
colormap = colormaps["%Wifi"]
colormap.caption = '% Wifi'
colormap.add_to(m)

folium.LayerControl(collapsed=False).add_to(m)

# Fullscreen toggle button in the bottom-right corner.
plugins.Fullscreen(
    position='bottomright',
    title='Expand me',
    title_cancel='Exit me',
    force_separate_button=True
).add_to(m)


# Display the map inline (last expression of the cell).
m
Make this Notebook Trusted to load map: File -> Trust Notebook

Finally, we can save the HTML locally or export it to another server (remember to also move the figures folder).

m.save('connetivity_world.html')