Internet Connectivity Trends#
Economic Analysis of Network Test Performance in Bangladesh#
The analysis of network testing data in Bangladesh after July 15 reveals a sharp divergence in testing activity across regions and districts. A key factor contributing to this volatility is the government-imposed internet shutdown between July 18 and July 23, which led to a complete halt in testing during this period. This disruption likely explains the sharp declines observed in many districts, followed by abrupt increases in test counts once connectivity was restored.
Number of Internet Tests Taken#
Show code cell source
Hide code cell source
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Legend, Span
from bokeh.layouts import column, row
from bokeh.palettes import Category20
from datetime import datetime
# Read the balanced network-performance dataset; `date_only` is parsed as datetimes.
df = pd.read_csv(
    '../../data/Filtered_NetworkPerformance_balanced.csv',
    parse_dates=['date_only'],
)

# For every region, record the districts whose summed test count is positive.
region_district_map = {}
for region in df['region'].unique():
    tests_per_district = (
        df.loc[df['region'] == region]
        .groupby('district')['count']
        .sum()
    )
    region_district_map[region] = (
        tests_per_district.loc[tests_per_district.gt(0)].index.tolist()
    )

# Keep the first 8 regions in order of appearance in the file (no ranking is applied).
top_regions = df['region'].unique()[:8]
filtered_df = df.loc[df['region'].isin(top_regions)]
def get_region_plot(region_df, region_name):
    """Build the daily test-count line chart for one region.

    One line is drawn per district (``date_only`` on x, ``count`` on y), the
    July 18 and July 23, 2024 shutdown boundaries are marked with dashed
    vertical spans, and a click-to-hide legend is placed right of the plot.

    Args:
        region_df: Rows of a single region; the columns read here are
            ``district``, ``date_only`` and ``count``.
        region_name: Key into ``region_district_map``; also shown
            (capitalized) in the title and the legend title.

    Returns:
        The configured Bokeh figure.
    """
    p = figure(
        title=f"Daily Network Performance Tests in {region_name.capitalize()}",
        x_axis_type='datetime',
        width=1000,
        height=600,
    )

    # Districts listed for this region that also occur in region_df.
    present = set(region_df['district'].unique())
    districts = sorted(d for d in region_district_map[region_name] if d in present)
    palette = Category20[20]

    district_lines = []
    for i, district in enumerate(districts):
        district_data = region_df[region_df['district'] == district]
        if district_data['count'].sum() > 0:
            district_lines.append(p.line(
                x='date_only',
                y='count',
                source=ColumnDataSource(district_data),
                line_width=2,
                legend_label=f"{district.capitalize()}",
                color=palette[i % len(palette)],  # palette repeats past 20 districts
            ))

    # Dashed vertical markers for the shutdown window.
    july_18 = datetime(2024, 7, 18)  # Start of internet shutdown
    july_23 = datetime(2024, 7, 23)  # End of internet shutdown
    for boundary in (july_18, july_23):
        p.add_layout(Span(location=boundary, dimension='height', line_color='black',
                          line_width=2, line_dash='dashed'))

    # Spans carry no legend entry, so zero-length dashed lines stand in for them.
    p.line([july_18, july_18], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Shutdown Initiation (July 18)')
    p.line([july_23, july_23], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Service Restoration (July 23)')

    # Configure legend
    p.add_layout(p.legend[0], 'right')
    p.legend.click_policy = "hide"
    p.legend.title = f'Districts in {region_name.capitalize()}'
    p.legend.title_text_font_style = "bold"
    p.legend.title_text_font_size = "12pt"
    p.legend.label_text_font_size = "10pt"
    p.legend.background_fill_alpha = 0.7
    p.legend.border_line_color = "gray"
    p.legend.border_line_alpha = 0.5

    # Hover only on the district lines: the legend proxy lines have no
    # date/count/district columns to display.
    p.add_tools(HoverTool(
        renderers=district_lines,
        tooltips=[
            ("Date", "@date_only{%F}"),
            ("Number of Tests", "@count{0,0}"),
            ("District", "@district"),
        ],
        formatters={'@date_only': 'datetime'},
        mode='vline',
    ))

    # Configure axes
    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = 'Number of Network Performance Tests'
    p.xaxis.axis_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'
    p.title.text_font_size = '14pt'

    # Style the plot
    p.grid.grid_line_color = "gray"
    p.grid.grid_line_alpha = 0.3
    p.background_fill_color = "#f8f9fa"
    p.border_fill_color = "white"
    return p
# Build one tab per region (each holding that region's plot), then render them.
tabs = []
for region in top_regions:
    region_rows = filtered_df[filtered_df['region'] == region]
    region_panel = TabPanel(
        child=row(get_region_plot(region_rows, region)),
        title=region.capitalize(),
    )
    tabs.append(region_panel)

output_notebook()
show(Tabs(tabs=tabs, sizing_mode="scale_both"))
/var/folders/k7/tnfx_2n55hd1cf9b5gv3gp640000gn/T/ipykernel_66243/3071509058.py:1: DeprecationWarning:
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
import pandas as pd
Percentage Change in Daily Test Counts vs baseline (after July 15)#
Show code cell source
Hide code cell source
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Legend, Span
from bokeh.layouts import column, row
from bokeh.palettes import Category20
from datetime import datetime
# Read the balanced network-performance dataset; `date_only` is parsed as datetimes.
df = pd.read_csv(
    '../../data/Filtered_NetworkPerformance_balanced.csv',
    parse_dates=['date_only'],
)

# For every region, record the districts whose summed test count is positive.
region_district_map = {}
for region in df['region'].unique():
    tests_per_district = (
        df.loc[df['region'] == region]
        .groupby('district')['count']
        .sum()
    )
    region_district_map[region] = (
        tests_per_district.loc[tests_per_district.gt(0)].index.tolist()
    )

# Keep the first 8 regions in order of appearance in the file (no ranking is applied).
top_regions = df['region'].unique()[:8]
filtered_df = df.loc[df['region'].isin(top_regions)]
def calculate_percentage_change(district_data):
    """Return the post-July-15 rows with their test-count change vs. baseline.

    The baseline is the mean ``count`` over July days 1-15. For every July row
    after the 15th a ``pct_change`` column holds
    ``(count - baseline) / baseline * 100``.

    A baseline of exactly zero yields ``pct_change`` 0 instead of dividing by
    zero, matching the rule used by ``export_network_test_data``. When there
    are no baseline rows the mean is NaN and so is ``pct_change``.

    Args:
        district_data: Rows for one district with a datetime ``date_only``
            column and a numeric ``count`` column.

    Returns:
        A new DataFrame (the input is not modified) holding the July rows
        after the 15th plus the ``pct_change`` column.
    """
    july_data = district_data[district_data['date_only'].dt.month == 7]
    day_of_month = july_data['date_only'].dt.day

    # Baseline: average of the first 15 days of July.
    baseline = july_data.loc[day_of_month <= 15, 'count'].mean()

    # Explicit copy so the new column is written to an independent frame.
    after_15th = july_data[day_of_month > 15].copy()
    if baseline == 0:
        after_15th['pct_change'] = 0.0
    else:
        after_15th['pct_change'] = (after_15th['count'] - baseline) / baseline * 100
    return after_15th
def get_region_plot(region_df, region_name):
    """Build the chart of test-count change vs. baseline for one region.

    For each district, ``calculate_percentage_change`` supplies the July rows
    after the 15th with a ``pct_change`` column, drawn as one line per
    district. A red dashed line marks 0% and dashed vertical spans mark the
    July 18 and July 23, 2024 shutdown boundaries.

    Args:
        region_df: Rows of a single region; the columns read here are
            ``district``, ``date_only`` and ``count``.
        region_name: Key into ``region_district_map``; also shown
            (capitalized) in the title and the legend title.

    Returns:
        The configured Bokeh figure.
    """
    p = figure(
        title=f"Percentage Change in Daily Tests vs Baseline (After July 15) - {region_name.capitalize()}",
        x_axis_type='datetime',
        width=1000,
        height=600,
    )

    # Districts listed for this region that also occur in region_df.
    present = set(region_df['district'].unique())
    districts = sorted(d for d in region_district_map[region_name] if d in present)
    palette = Category20[20]

    district_lines = []
    for i, district in enumerate(districts):
        district_data = region_df[region_df['district'] == district]
        if district_data['count'].sum() > 0:
            plot_data = calculate_percentage_change(district_data)
            if not plot_data.empty:  # Only create line if we have data after July 15
                district_lines.append(p.line(
                    x='date_only',
                    y='pct_change',
                    source=ColumnDataSource(plot_data),
                    line_width=2,
                    legend_label=f"{district.capitalize()}",
                    color=palette[i % len(palette)],  # palette repeats past 20 districts
                ))

    # 0% reference line over the same window the district lines cover:
    # July dates after the 15th.
    dates = region_df['date_only']
    date_range = dates[(dates.dt.month == 7) & (dates.dt.day > 15)]
    if not date_range.empty:
        p.line(
            x=[date_range.min(), date_range.max()],
            y=[0, 0],
            line_width=2,
            color='red',
            line_dash='dashed',
            legend_label='Baseline',
        )

    # Dashed vertical markers for the shutdown window.
    july_18 = datetime(2024, 7, 18)  # Adjust year as needed
    july_23 = datetime(2024, 7, 23)  # Adjust year as needed
    for boundary in (july_18, july_23):
        p.add_layout(Span(location=boundary, dimension='height', line_color='black',
                          line_width=2, line_dash='dashed'))

    # Spans carry no legend entry, so zero-length dashed lines stand in for them.
    p.line([july_18, july_18], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Shutdown Initiation (July 18)')
    p.line([july_23, july_23], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Service Restoration (July 23)')

    # Configure legend
    p.add_layout(p.legend[0], 'right')
    p.legend.click_policy = "hide"
    p.legend.title = f'Districts in {region_name.capitalize()}'
    p.legend.title_text_font_style = "bold"
    p.legend.title_text_font_size = "12pt"
    p.legend.label_text_font_size = "10pt"
    p.legend.background_fill_alpha = 0.7
    p.legend.border_line_color = "gray"
    p.legend.border_line_alpha = 0.5

    # Hover only on the district lines: the baseline and legend proxy lines
    # have none of the columns the tooltip refers to.
    p.add_tools(HoverTool(
        renderers=district_lines,
        tooltips=[
            ("Date", "@date_only{%F}"),
            ("Percentage Change", "@pct_change{0.1}%"),
            ("Daily Tests", "@count{0,0}"),
            ("District", "@district"),
        ],
        formatters={'@date_only': 'datetime'},
        mode='vline',
    ))

    # Configure axes
    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = 'Percentage Change (%)'
    p.xaxis.axis_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'
    p.title.text_font_size = '14pt'

    # Style the plot
    p.grid.grid_line_color = "gray"
    p.grid.grid_line_alpha = 0.3
    p.background_fill_color = "#f8f9fa"
    p.border_fill_color = "white"
    return p
# Build one tab per region (each holding that region's plot), then render them.
tabs = []
for region in top_regions:
    region_rows = filtered_df[filtered_df['region'] == region]
    region_panel = TabPanel(
        child=row(get_region_plot(region_rows, region)),
        title=region.capitalize(),
    )
    tabs.append(region_panel)

output_notebook()
show(Tabs(tabs=tabs, sizing_mode="scale_both"))
Show code cell source
Hide code cell source
def export_network_test_data(df, output_path='network_test_metrics.csv'):
    """Export July test counts with their change vs. baseline for each district.

    For every (region, district) pair the baseline is the mean ``count`` over
    July days 1-15. Each July row is written with its count, the baseline and
    ``(count - baseline) / baseline * 100``. A baseline of exactly zero gives
    a percentage change of 0; a pair with no rows in days 1-15 has a NaN
    baseline and a NaN percentage change.

    Args:
        df: DataFrame with a datetime ``date_only`` column plus ``region``,
            ``district`` and ``count`` columns.
        output_path: Destination CSV path.

    Returns:
        The exported DataFrame, sorted by Region, District and Date with a
        fresh 0..n-1 index. It is empty, but has all columns, when the input
        holds no July rows. ``None`` is returned if anything raises; the error
        is printed rather than propagated.
    """
    columns = ['Date', 'Region', 'District', 'Daily_Tests', 'Baseline_Tests',
               'Percentage_Change', 'Is_After_July_15']
    try:
        # Rows without a region or district cannot be attributed to a group,
        # so they are left out.
        july = df[df['date_only'].dt.month == 7].dropna(subset=['region', 'district'])

        if july.empty:
            # Nothing to report is not an error: export a header-only file.
            metrics_df = pd.DataFrame(columns=columns)
        else:
            is_after = july['date_only'].dt.day > 15
            # Mask out counts after the 15th so the grouped mean only sees
            # the baseline window, then broadcast it back to every row.
            baseline = (
                july.assign(_first_half=july['count'].where(~is_after))
                .groupby(['region', 'district'])['_first_half']
                .transform('mean')
            )
            pct_change = ((july['count'] - baseline) / baseline * 100).where(baseline != 0, 0)
            metrics_df = pd.DataFrame({
                'Date': july['date_only'],
                'Region': july['region'],
                'District': july['district'],
                'Daily_Tests': july['count'],
                'Baseline_Tests': baseline,
                'Percentage_Change': pct_change,
                'Is_After_July_15': is_after,
            })[columns]

        metrics_df = metrics_df.sort_values(['Region', 'District', 'Date']).reset_index(drop=True)
        metrics_df.to_csv(output_path, index=False)
        print(f"Network test metrics exported to {output_path}")
        return metrics_df
    except Exception as e:
        print(f"Error exporting network test metrics: {str(e)}")
        return None
# Example usage: reload the balanced dataset and write the July metrics beside it.
file_path = '../../data/Filtered_NetworkPerformance_balanced.csv'
df = pd.read_csv(filepath_or_buffer=file_path, parse_dates=['date_only'])
network_metrics = export_network_test_data(
    df=df,
    output_path='../../data/network_test_metrics.csv',
)
Network test metrics exported to ../../data/network_test_metrics.csv
Average download speed tests#
Show code cell source
Hide code cell source
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Legend, Span
from bokeh.layouts import column, row
from bokeh.palettes import Category20
from datetime import datetime
# Read the balanced dataset, parsing `date_only` as datetimes.
df = pd.read_csv(
    '../../data/Filtered_NetworkPerformance_balanced.csv',
    parse_dates=['date_only'],
)

# Per district: average test count over days 1-15 of the month (no month
# filter is applied here), then zero the download speed on every day whose
# test count is below 90% of that average.
in_first_15_days = df['date_only'].dt.day <= 15
district_baselines = {}
for district in df['district'].unique():
    district_mask = df['district'] == district
    baseline = df.loc[district_mask & in_first_15_days, 'count'].mean()
    district_baselines[district] = baseline
    df.loc[district_mask & (df['count'] < 0.9 * baseline), 'av_val_download_mbps'] = 0

# For every region, record the districts whose mean (adjusted) download speed is positive.
region_district_map = {}
for region in df['region'].unique():
    mean_speed = (
        df.loc[df['region'] == region]
        .groupby('district')['av_val_download_mbps']
        .mean()
    )
    region_district_map[region] = mean_speed.loc[mean_speed.gt(0)].index.tolist()

# Keep the first 8 regions in order of appearance in the file (no ranking is applied).
top_regions = df['region'].unique()[:8]
filtered_df = df.loc[df['region'].isin(top_regions)]
def get_region_plot(region_df, region_name):
    """Build the daily average download-speed chart for one region.

    One line is drawn per district (``date_only`` on x,
    ``av_val_download_mbps`` on y), the July 18 and July 23, 2024 shutdown
    boundaries are marked with dashed vertical spans, and a click-to-hide
    legend is placed right of the plot.

    Args:
        region_df: Rows of a single region; the columns read here are
            ``district``, ``date_only``, ``av_val_download_mbps`` and
            ``count``.
        region_name: Key into ``region_district_map``; also shown
            (capitalized) in the title and the legend title.

    Returns:
        The configured Bokeh figure.
    """
    p = figure(
        title=f"Average Daily Download Speed in {region_name.capitalize()} (Adjusted for Low Usage)",
        x_axis_type='datetime',
        width=1000,
        height=600,
    )

    # Districts listed for this region that also occur in region_df.
    present = set(region_df['district'].unique())
    districts = sorted(d for d in region_district_map[region_name] if d in present)
    palette = Category20[20]

    district_lines = []
    for i, district in enumerate(districts):
        district_data = region_df[region_df['district'] == district]
        if district_data['av_val_download_mbps'].mean() > 0:
            district_lines.append(p.line(
                x='date_only',
                y='av_val_download_mbps',
                source=ColumnDataSource(district_data),
                line_width=2,
                legend_label=district.capitalize(),
                color=palette[i % len(palette)],  # palette repeats past 20 districts
            ))

    # Dashed vertical markers for the shutdown window.
    july_18 = datetime(2024, 7, 18)
    july_23 = datetime(2024, 7, 23)
    for boundary in (july_18, july_23):
        p.add_layout(Span(location=boundary, dimension='height', line_color='black',
                          line_width=2, line_dash='dashed'))

    # Spans carry no legend entry, so zero-length dashed lines stand in for them.
    p.line([july_18, july_18], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Shutdown Initiation (July 18)')
    p.line([july_23, july_23], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Service Restoration (July 23)')

    # Configure legend
    p.add_layout(p.legend[0], 'right')
    p.legend.click_policy = "hide"
    p.legend.title = f'Districts in {region_name.capitalize()}'
    p.legend.title_text_font_style = "bold"
    p.legend.title_text_font_size = "12pt"
    p.legend.label_text_font_size = "10pt"
    p.legend.background_fill_alpha = 0.7
    p.legend.border_line_color = "gray"
    p.legend.border_line_alpha = 0.5

    # Hover only on the district lines; the district name is included so
    # stacked vline tooltips can be told apart.
    p.add_tools(HoverTool(
        renderers=district_lines,
        tooltips=[
            ("Date", "@date_only{%F}"),
            ("Average Download Speed (Mbps)", "@av_val_download_mbps{0.00}"),
            ("Number of Tests", "@count"),
            ("District", "@district"),
        ],
        formatters={'@date_only': 'datetime'},
        mode='vline',
    ))

    # Configure axes
    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = 'Download Speed (Mbps)'
    p.xaxis.axis_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'
    p.title.text_font_size = '14pt'

    # Style the plot
    p.grid.grid_line_color = "gray"
    p.grid.grid_line_alpha = 0.3
    p.background_fill_color = "#f8f9fa"
    p.border_fill_color = "white"
    return p
# Build one tab per region (each holding that region's plot), then render them.
tabs = []
for region in top_regions:
    region_rows = filtered_df[filtered_df['region'] == region]
    region_panel = TabPanel(
        child=row(get_region_plot(region_rows, region)),
        title=region.capitalize(),
    )
    tabs.append(region_panel)

output_notebook()
show(Tabs(tabs=tabs, sizing_mode="scale_both"))
Percentage Change in Average Download Speeds vs baseline (after July 15)#
Show code cell source
Hide code cell source
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Legend, Span
from bokeh.layouts import column, row
from bokeh.palettes import Category20
from datetime import datetime
# Read the balanced dataset, parsing `date_only` as datetimes.
df = pd.read_csv(
    '../../data/Filtered_NetworkPerformance_balanced.csv',
    parse_dates=['date_only'],
)

# Keep the unadjusted download speeds before any value is zeroed.
# NOTE(review): `original_download` is not read again in this cell - confirm
# whether the baseline was meant to use it.
df['original_download'] = df['av_val_download_mbps']

# Per district: average test count over days 1-15 of the month (no month
# filter is applied here), then zero the download speed on every day whose
# test count is below 90% of that average.
in_first_15_days = df['date_only'].dt.day <= 15
district_baselines = {}
for district in df['district'].unique():
    district_mask = df['district'] == district
    baseline = df.loc[district_mask & in_first_15_days, 'count'].mean()
    district_baselines[district] = baseline
    df.loc[district_mask & (df['count'] < 0.9 * baseline), 'av_val_download_mbps'] = 0

# For every region, record the districts whose summed test count is positive.
region_district_map = {}
for region in df['region'].unique():
    tests_per_district = (
        df.loc[df['region'] == region]
        .groupby('district')['count']
        .sum()
    )
    region_district_map[region] = (
        tests_per_district.loc[tests_per_district.gt(0)].index.tolist()
    )

# Keep the first 8 regions in order of appearance in the file (no ranking is applied).
top_regions = df['region'].unique()[:8]
filtered_df = df.loc[df['region'].isin(top_regions)]
def calculate_download_percentage_change(district_data):
    """Return the post-July-15 rows with their download-speed change vs. baseline.

    The baseline is the mean ``av_val_download_mbps`` over July days 1-15. For
    every July row after the 15th a ``pct_change`` column holds
    ``(speed - baseline) / baseline * 100``.

    A baseline of exactly zero yields ``pct_change`` 0 instead of dividing by
    zero. When there are no baseline rows the mean is NaN and so is
    ``pct_change``.

    Args:
        district_data: Rows for one district with a datetime ``date_only``
            column and a numeric ``av_val_download_mbps`` column.

    Returns:
        A new DataFrame (the input is not modified) holding the July rows
        after the 15th plus the ``pct_change`` column.
    """
    july_data = district_data[district_data['date_only'].dt.month == 7]
    day_of_month = july_data['date_only'].dt.day

    # Baseline: average of the first 15 days of July.
    # NOTE(review): callers in this file zero low-usage days in this column
    # before calling, which lowers the baseline - confirm that is intended.
    baseline = july_data.loc[day_of_month <= 15, 'av_val_download_mbps'].mean()

    # Explicit copy so the new column is written to an independent frame.
    after_15th = july_data[day_of_month > 15].copy()
    if baseline == 0:
        after_15th['pct_change'] = 0.0
    else:
        after_15th['pct_change'] = (
            (after_15th['av_val_download_mbps'] - baseline) / baseline * 100
        )
    return after_15th
def get_region_plot(region_df, region_name):
    """Build the chart of download-speed change vs. baseline for one region.

    For each district, ``calculate_download_percentage_change`` supplies the
    July rows after the 15th with a ``pct_change`` column, drawn as one line
    per district. A red dashed line marks 0% and dashed vertical spans mark
    the July 18 and July 23, 2024 shutdown boundaries.

    Args:
        region_df: Rows of a single region; the columns read here are
            ``district``, ``date_only`` and ``av_val_download_mbps``.
        region_name: Key into ``region_district_map``; also shown
            (capitalized) in the title and the legend title.

    Returns:
        The configured Bokeh figure.
    """
    p = figure(
        title=f"Percentage Change in Download Speed vs Baseline (After July 15) - {region_name.capitalize()}",
        x_axis_type='datetime',
        width=1000,
        height=600,
    )

    # Districts listed for this region that also occur in region_df.
    present = set(region_df['district'].unique())
    districts = sorted(d for d in region_district_map[region_name] if d in present)
    palette = Category20[20]

    district_lines = []
    for i, district in enumerate(districts):
        district_data = region_df[region_df['district'] == district]
        if district_data['av_val_download_mbps'].mean() > 0:
            plot_data = calculate_download_percentage_change(district_data)
            if not plot_data.empty:  # Only create line if we have data after July 15
                district_lines.append(p.line(
                    x='date_only',
                    y='pct_change',
                    source=ColumnDataSource(plot_data),
                    line_width=2,
                    legend_label=f"{district.capitalize()}",
                    color=palette[i % len(palette)],  # palette repeats past 20 districts
                ))

    # 0% reference line over the same window the district lines cover:
    # July dates after the 15th.
    dates = region_df['date_only']
    date_range = dates[(dates.dt.month == 7) & (dates.dt.day > 15)]
    if not date_range.empty:
        p.line(
            x=[date_range.min(), date_range.max()],
            y=[0, 0],
            line_width=2,
            color='red',
            line_dash='dashed',
            legend_label='Baseline',
        )

    # Dashed vertical markers for the shutdown window.
    july_18 = datetime(2024, 7, 18)
    july_23 = datetime(2024, 7, 23)
    for boundary in (july_18, july_23):
        p.add_layout(Span(location=boundary, dimension='height', line_color='black',
                          line_width=2, line_dash='dashed'))

    # Spans carry no legend entry, so zero-length dashed lines stand in for them.
    p.line([july_18, july_18], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Shutdown Initiation (July 18)')
    p.line([july_23, july_23], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Service Restoration (July 23)')

    # Configure legend
    p.add_layout(p.legend[0], 'right')
    p.legend.click_policy = "hide"
    p.legend.title = f'Districts in {region_name.capitalize()}'
    p.legend.title_text_font_style = "bold"
    p.legend.title_text_font_size = "12pt"
    p.legend.label_text_font_size = "10pt"
    p.legend.background_fill_alpha = 0.7
    p.legend.border_line_color = "gray"
    p.legend.border_line_alpha = 0.5

    # Hover only on the district lines: the baseline and legend proxy lines
    # have none of the columns the tooltip refers to.
    p.add_tools(HoverTool(
        renderers=district_lines,
        tooltips=[
            ("Date", "@date_only{%F}"),
            ("Percentage Change", "@pct_change{0.1}%"),
            ("Download Speed (Mbps)", "@av_val_download_mbps{0.00}"),
            ("Number of Tests", "@count{0,0}"),
            ("District", "@district"),
        ],
        formatters={'@date_only': 'datetime'},
        mode='vline',
    ))

    # Configure axes
    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = 'Download Speed Change (%)'
    p.xaxis.axis_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'
    p.title.text_font_size = '14pt'

    # Style the plot
    p.grid.grid_line_color = "gray"
    p.grid.grid_line_alpha = 0.3
    p.background_fill_color = "#f8f9fa"
    p.border_fill_color = "white"
    return p
# Build one tab per region (each holding that region's plot), then render them.
tabs = []
for region in top_regions:
    region_rows = filtered_df[filtered_df['region'] == region]
    region_panel = TabPanel(
        child=row(get_region_plot(region_rows, region)),
        title=region.capitalize(),
    )
    tabs.append(region_panel)

output_notebook()
show(Tabs(tabs=tabs, sizing_mode="scale_both"))
Average Upload Speed Test#
Show code cell source
Hide code cell source
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Legend, Span
from bokeh.layouts import column, row
from bokeh.palettes import Category20
from datetime import datetime
# Read the balanced dataset, parsing `date_only` as datetimes.
df = pd.read_csv(
    '../../data/Filtered_NetworkPerformance_balanced.csv',
    parse_dates=['date_only'],
)

# Per district: average test count over days 1-15 of the month (no month
# filter is applied here), then zero the upload speed on every day whose
# test count is below 90% of that average.
in_first_15_days = df['date_only'].dt.day <= 15
district_baselines = {}
for district in df['district'].unique():
    district_mask = df['district'] == district
    baseline = df.loc[district_mask & in_first_15_days, 'count'].mean()
    district_baselines[district] = baseline
    df.loc[district_mask & (df['count'] < 0.9 * baseline), 'av_val_upload_mbps'] = 0

# For every region, record the districts whose mean (adjusted) upload speed is positive.
region_district_map = {}
for region in df['region'].unique():
    mean_speed = (
        df.loc[df['region'] == region]
        .groupby('district')['av_val_upload_mbps']
        .mean()
    )
    region_district_map[region] = mean_speed.loc[mean_speed.gt(0)].index.tolist()

# Keep the first 8 regions in order of appearance in the file (no ranking is applied).
top_regions = df['region'].unique()[:8]
filtered_df = df.loc[df['region'].isin(top_regions)]
def get_region_plot(region_df, region_name):
    """Build the daily average upload-speed chart for one region.

    One line is drawn per district (``date_only`` on x,
    ``av_val_upload_mbps`` on y), the July 18 and July 23, 2024 shutdown
    boundaries are marked with dashed vertical spans, and a click-to-hide
    legend is placed right of the plot.

    Args:
        region_df: Rows of a single region; the columns read here are
            ``district``, ``date_only``, ``av_val_upload_mbps`` and ``count``.
        region_name: Key into ``region_district_map``; also shown
            (capitalized) in the title and the legend title.

    Returns:
        The configured Bokeh figure.
    """
    p = figure(
        title=f"Average Daily Upload Speed in {region_name.capitalize()} (Adjusted for Low Usage)",
        x_axis_type='datetime',
        width=1000,
        height=600,
    )

    # Districts listed for this region that also occur in region_df.
    present = set(region_df['district'].unique())
    districts = sorted(d for d in region_district_map[region_name] if d in present)
    palette = Category20[20]

    district_lines = []
    for i, district in enumerate(districts):
        district_data = region_df[region_df['district'] == district]
        if district_data['av_val_upload_mbps'].mean() > 0:
            district_lines.append(p.line(
                x='date_only',
                y='av_val_upload_mbps',
                source=ColumnDataSource(district_data),
                line_width=2,
                legend_label=district.capitalize(),
                color=palette[i % len(palette)],  # palette repeats past 20 districts
            ))

    # Dashed vertical markers for the shutdown window.
    july_18 = datetime(2024, 7, 18)
    july_23 = datetime(2024, 7, 23)
    for boundary in (july_18, july_23):
        p.add_layout(Span(location=boundary, dimension='height', line_color='black',
                          line_width=2, line_dash='dashed'))

    # Spans carry no legend entry, so zero-length dashed lines stand in for them.
    p.line([july_18, july_18], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Shutdown Initiation (July 18)')
    p.line([july_23, july_23], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Service Restoration (July 23)')

    # Configure legend
    p.add_layout(p.legend[0], 'right')
    p.legend.click_policy = "hide"
    p.legend.title = f'Districts in {region_name.capitalize()}'
    p.legend.title_text_font_style = "bold"
    p.legend.title_text_font_size = "12pt"
    p.legend.label_text_font_size = "10pt"
    p.legend.background_fill_alpha = 0.7
    p.legend.border_line_color = "gray"
    p.legend.border_line_alpha = 0.5

    # Hover only on the district lines; the district name is included so
    # stacked vline tooltips can be told apart.
    p.add_tools(HoverTool(
        renderers=district_lines,
        tooltips=[
            ("Date", "@date_only{%F}"),
            ("Average Upload Speed (Mbps)", "@av_val_upload_mbps{0.00}"),
            ("Number of Tests", "@count"),
            ("District", "@district"),
        ],
        formatters={'@date_only': 'datetime'},
        mode='vline',
    ))

    # Configure axes
    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = 'Upload Speed (Mbps)'
    p.xaxis.axis_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'
    p.title.text_font_size = '14pt'

    # Style the plot
    p.grid.grid_line_color = "gray"
    p.grid.grid_line_alpha = 0.3
    p.background_fill_color = "#f8f9fa"
    p.border_fill_color = "white"
    return p
# Build one tab per region (each holding that region's plot), then render them.
tabs = []
for region in top_regions:
    region_rows = filtered_df[filtered_df['region'] == region]
    region_panel = TabPanel(
        child=row(get_region_plot(region_rows, region)),
        title=region.capitalize(),
    )
    tabs.append(region_panel)

output_notebook()
show(Tabs(tabs=tabs, sizing_mode="scale_both"))
Percentage change in upload speed vs baseline (after July 15)#
Show code cell source
Hide code cell source
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Legend, Span
from bokeh.layouts import column, row
from bokeh.palettes import Category20
from datetime import datetime
# Read the balanced dataset, parsing `date_only` as datetimes.
df = pd.read_csv(
    '../../data/Filtered_NetworkPerformance_balanced.csv',
    parse_dates=['date_only'],
)

# Keep the unadjusted upload speeds before any value is zeroed.
# NOTE(review): `original_upload` is not read again in this cell - confirm
# whether the baseline was meant to use it.
df['original_upload'] = df['av_val_upload_mbps']

# Per district: average test count over days 1-15 of the month (no month
# filter is applied here), then zero the upload speed on every day whose
# test count is below 90% of that average.
in_first_15_days = df['date_only'].dt.day <= 15
district_baselines = {}
for district in df['district'].unique():
    district_mask = df['district'] == district
    baseline = df.loc[district_mask & in_first_15_days, 'count'].mean()
    district_baselines[district] = baseline
    df.loc[district_mask & (df['count'] < 0.9 * baseline), 'av_val_upload_mbps'] = 0

# For every region, record the districts whose summed test count is positive.
region_district_map = {}
for region in df['region'].unique():
    tests_per_district = (
        df.loc[df['region'] == region]
        .groupby('district')['count']
        .sum()
    )
    region_district_map[region] = (
        tests_per_district.loc[tests_per_district.gt(0)].index.tolist()
    )

# Keep the first 8 regions in order of appearance in the file (no ranking is applied).
top_regions = df['region'].unique()[:8]
filtered_df = df.loc[df['region'].isin(top_regions)]
def calculate_upload_percentage_change(district_data):
    """Return the post-July-15 rows with their upload-speed change vs. baseline.

    The baseline is the mean ``av_val_upload_mbps`` over July days 1-15. For
    every July row after the 15th a ``pct_change`` column holds
    ``(speed - baseline) / baseline * 100``.

    A baseline of exactly zero yields ``pct_change`` 0 instead of dividing by
    zero. When there are no baseline rows the mean is NaN and so is
    ``pct_change``.

    Args:
        district_data: Rows for one district with a datetime ``date_only``
            column and a numeric ``av_val_upload_mbps`` column.

    Returns:
        A new DataFrame (the input is not modified) holding the July rows
        after the 15th plus the ``pct_change`` column.
    """
    july_data = district_data[district_data['date_only'].dt.month == 7]
    day_of_month = july_data['date_only'].dt.day

    # Baseline: average of the first 15 days of July.
    # NOTE(review): callers in this file zero low-usage days in this column
    # before calling, which lowers the baseline - confirm that is intended.
    baseline = july_data.loc[day_of_month <= 15, 'av_val_upload_mbps'].mean()

    # Explicit copy so the new column is written to an independent frame.
    after_15th = july_data[day_of_month > 15].copy()
    if baseline == 0:
        after_15th['pct_change'] = 0.0
    else:
        after_15th['pct_change'] = (
            (after_15th['av_val_upload_mbps'] - baseline) / baseline * 100
        )
    return after_15th
def get_region_plot(region_df, region_name):
    """Build the chart of upload-speed change vs. baseline for one region.

    For each district, ``calculate_upload_percentage_change`` supplies the
    July rows after the 15th with a ``pct_change`` column, drawn as one line
    per district. A red dashed line marks 0% and dashed vertical spans mark
    the July 18 and July 23, 2024 shutdown boundaries.

    Args:
        region_df: Rows of a single region; the columns read here are
            ``district``, ``date_only`` and ``av_val_upload_mbps``.
        region_name: Key into ``region_district_map``; also shown
            (capitalized) in the title and the legend title.

    Returns:
        The configured Bokeh figure.
    """
    p = figure(
        title=f"Percentage Change in Upload Speed vs Baseline (After July 15) - {region_name.capitalize()}",
        x_axis_type='datetime',
        width=1000,
        height=600,
    )

    # Districts listed for this region that also occur in region_df.
    present = set(region_df['district'].unique())
    districts = sorted(d for d in region_district_map[region_name] if d in present)
    palette = Category20[20]

    district_lines = []
    for i, district in enumerate(districts):
        district_data = region_df[region_df['district'] == district]
        if district_data['av_val_upload_mbps'].mean() > 0:
            plot_data = calculate_upload_percentage_change(district_data)
            if not plot_data.empty:  # Only create line if we have data after July 15
                district_lines.append(p.line(
                    x='date_only',
                    y='pct_change',
                    source=ColumnDataSource(plot_data),
                    line_width=2,
                    legend_label=f"{district.capitalize()}",
                    color=palette[i % len(palette)],  # palette repeats past 20 districts
                ))

    # 0% reference line over the same window the district lines cover:
    # July dates after the 15th.
    dates = region_df['date_only']
    date_range = dates[(dates.dt.month == 7) & (dates.dt.day > 15)]
    if not date_range.empty:
        p.line(
            x=[date_range.min(), date_range.max()],
            y=[0, 0],
            line_width=2,
            color='red',
            line_dash='dashed',
            legend_label='Baseline',
        )

    # Dashed vertical markers for the shutdown window.
    july_18 = datetime(2024, 7, 18)
    july_23 = datetime(2024, 7, 23)
    for boundary in (july_18, july_23):
        p.add_layout(Span(location=boundary, dimension='height', line_color='black',
                          line_width=2, line_dash='dashed'))

    # Spans carry no legend entry, so zero-length dashed lines stand in for them.
    p.line([july_18, july_18], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Shutdown Initiation (July 18)')
    p.line([july_23, july_23], [0, 0], line_color='black', line_width=2,
           line_dash='dashed', legend_label='Internet Service Restoration (July 23)')

    # Configure legend
    p.add_layout(p.legend[0], 'right')
    p.legend.click_policy = "hide"
    p.legend.title = f'Districts in {region_name.capitalize()}'
    p.legend.title_text_font_style = "bold"
    p.legend.title_text_font_size = "12pt"
    p.legend.label_text_font_size = "10pt"
    p.legend.background_fill_alpha = 0.7
    p.legend.border_line_color = "gray"
    p.legend.border_line_alpha = 0.5

    # Hover only on the district lines: the baseline and legend proxy lines
    # have none of the columns the tooltip refers to.
    p.add_tools(HoverTool(
        renderers=district_lines,
        tooltips=[
            ("Date", "@date_only{%F}"),
            ("Percentage Change", "@pct_change{0.1}%"),
            ("Upload Speed (Mbps)", "@av_val_upload_mbps{0.00}"),
            ("Number of Tests", "@count{0,0}"),
            ("District", "@district"),
        ],
        formatters={'@date_only': 'datetime'},
        mode='vline',
    ))

    # Configure axes
    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = 'Upload Speed Change (%)'
    p.xaxis.axis_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'
    p.title.text_font_size = '14pt'

    # Style the plot
    p.grid.grid_line_color = "gray"
    p.grid.grid_line_alpha = 0.3
    p.background_fill_color = "#f8f9fa"
    p.border_fill_color = "white"
    return p
# Build one tab per region (each holding that region's plot), then render them.
tabs = []
for region in top_regions:
    region_rows = filtered_df[filtered_df['region'] == region]
    region_panel = TabPanel(
        child=row(get_region_plot(region_rows, region)),
        title=region.capitalize(),
    )
    tabs.append(region_panel)

output_notebook()
show(Tabs(tabs=tabs, sizing_mode="scale_both"))