Space2Stats Walkthrough#
This notebook walks through an example that explores the Space2Stats Metadata, and then uses the API to fetch flood and climate statistics for various provinces within a country.
# !pip install space2stats-client matplotlib contextily plotnine
First, we need to import the necessary libraries and set up the API client.
# Core data-handling and plotting libraries
import pandas as pd
import geopandas as gpd
from shapely import from_geojson  # parses GeoJSON strings returned by the API into shapely geometries
import matplotlib.pyplot as plt # pip install matplotlib contextily
import contextily as ctx  # basemap tiles for static maps
from space2stats_client import Space2StatsClient
# Initialize the Space2Stats API client
client = Space2StatsClient()
1. Define Areas of Interest#
We will define our AOIs by fetching admin boundaries from the GeoBoundaries project.
# Fetch admin boundaries for the AOI from the GeoBoundaries project
ISO3 = "SSD" # South Sudan
ADM = "ADM2" # Level 2 administrative boundaries
adm_boundaries = client.fetch_admin_boundaries(ISO3, ADM)
# Quick sanity checks: attribute table and footprint of the returned boundaries
adm_boundaries.head()
adm_boundaries.plot()
2. Query Metadata#
Each dataset in Space2Stats is stored as a STAC item. Metadata for each item can be explored through the following browser.
The get_topics function retrieves a table with key details for each dataset, along with an item identifier (item_id) that can be used to query fields from each dataset.
# Retrieve the catalog of Space2Stats datasets (one row per STAC item)
topics = client.get_topics()
# Show full cell contents so long descriptions are not truncated
pd.set_option("display.max_colwidth", None)
topics
We can extract additional metadata like fields and descriptions using the item id.
# Field-level metadata (names, descriptions) for the flood exposure item
properties = client.get_properties("flood_exposure_15cm_1in100")
properties
3. Extract H3 Data#
Let’s work with a subset of fields from the flood exposure item: ['pop', 'pop_flood', 'pop_flood_pct']
# Fields to request from the flood exposure dataset
flood_vars = ['pop', 'pop_flood', 'pop_flood_pct']
# Show the signature/docstring of get_summary.
# NOTE(review): `?` is IPython-only syntax; it will not run as plain Python.
client.get_summary?
Run API Calls
# Fetch the flood exposure fields for every H3 hexagon covering the AOI
df = client.get_summary(
gdf=adm_boundaries, # Area of Interest
spatial_join_method="centroid", # Spatial join method (between h3 cells and each feature)
fields=flood_vars, # Fields from Space2Stats to query
geometry="polygon" # Whether to return the geometry of the hexagons
)
# Restore the default column width set earlier for the metadata table
pd.reset_option('display.max_colwidth')
df.head()
Check that there are no duplicate hexagon ids
df['hex_id'].duplicated().sum()
Convert geometry column from geojson into shapely polygons
# Parse each GeoJSON geometry string into a shapely geometry.
# `from_geojson` is applied directly -- wrapping it in a lambda was redundant.
df["geometry"] = df["geometry"].apply(from_geojson)
# Promote the dataframe to a GeoDataFrame in WGS84 (lon/lat)
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
gdf.head()
Map H3 Data#
# Choropleth of total exposed population per hexagon
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(ax=ax, column="pop_flood",
         legend=True, cmap="Reds", alpha=0.75,
         # Use the same scheme spelling as the later admin-level maps
         # ("natural_breaks"); mapclassify accepts both spellings.
         scheme="natural_breaks", k=5,
         legend_kwds=dict(title='Total Pop. Exposed', fmt="{:,.0f}"),
         linewidth=0)
ctx.add_basemap(ax, source=ctx.providers.Esri.WorldPhysical, crs='EPSG:4326')
plt.title("Population Exposed to Floods (>15cm, 1 in 100 years)", fontsize=16)
plt.axis("off")
plt.show()
# Same hexagon map, but showing the share of population exposed
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
map_style = dict(
    column="pop_flood_pct",
    legend=True,
    cmap="Reds",
    alpha=0.75,
    scheme="equal_interval",
    k=5,
    legend_kwds=dict(title='% Pop. Exposed', fmt="{:.0%}"),
    linewidth=0,
)
gdf.plot(ax=ax, **map_style)
ctx.add_basemap(ax, source=ctx.providers.Esri.WorldPhysical, crs='EPSG:4326')
ax.set_title("% of Population Exposed to Floods (>15cm, 1 in 100 years)", fontsize=16)
ax.set_axis_off()
plt.show()
4. Extract Admin Summaries#
# Aggregate the flood statistics directly to the admin boundaries (zonal sums)
adm_boundaries_zs = client.get_aggregate(
gdf=adm_boundaries, # Area of Interest
spatial_join_method="centroid", # Spatial join method (between h3 cells and each feature)
fields=['pop', 'pop_flood'], # Fields from Space2Stats to query
aggregation_type="sum" # Aggregation type
)
adm_boundaries_zs.head()
Recalculate share of population exposed with aggregate data
adm_boundaries_zs.loc[:, "pop_flood_pct"] = adm_boundaries_zs["pop_flood"] / adm_boundaries_zs["pop"]
# Choropleth of total exposed population per district (admin-level sums)
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
adm_boundaries_zs.plot(
ax=ax, column="pop_flood", legend=True,
cmap="Reds", scheme="natural_breaks",
k=5, legend_kwds=dict(title='Total Pop. Exposed', fmt="{:,.0f}"),
linewidth=0.2, edgecolor='black')
ctx.add_basemap(ax, source=ctx.providers.Esri.WorldPhysical, crs='EPSG:4326')
plt.title("Population Exposed to Floods (>15cm, 1 in 100 years)", fontsize=16)
plt.axis("off")
plt.show()
# Same admin-level map, but for the share of population exposed
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
map_style = dict(
    column="pop_flood_pct",
    legend=True,
    cmap="Reds",
    scheme="natural_breaks",
    k=5,
    legend_kwds=dict(title='% Pop. Exposed', fmt="{:.0%}"),
    linewidth=0.2,
    edgecolor='black',
)
adm_boundaries_zs.plot(ax=ax, **map_style)
ctx.add_basemap(ax, source=ctx.providers.Esri.WorldPhysical, crs='EPSG:4326')
ax.set_title("% of Population Exposed to Floods (>15cm, 1 in 100 years)", fontsize=16)
ax.set_axis_off()
plt.show()
List the top 10 provinces by population exposed
# Top 10 provinces ranked by total exposed population
top10 = (
    adm_boundaries_zs
    .sort_values('pop_flood', ascending=False)
    .head(10)[['shapeName', 'pop_flood', 'pop_flood_pct']]
    .rename(columns={'shapeName': 'Province'})
    .reset_index(drop=True)
)
# Human-readable formatting for the report table
top10["Population Exposed"] = top10["pop_flood"].map("{:,.0f}".format)
top10["Population Exposed (%)"] = top10["pop_flood_pct"].map("{:.2%}".format)
display(top10[['Province', 'Population Exposed', 'Population Exposed (%)']])
5. Extract Climate Data#
client.get_timeseries_fields()
# Switch AOI to Sri Lanka for the climate example
ISO3 = "LKA" # Sri Lanka
ADM = "ADM2"
adm_boundaries_lka = client.fetch_admin_boundaries(ISO3, ADM)
# Fetch SPI values per H3 hexagon from 2019 onward
df_ts = client.get_timeseries(
gdf=adm_boundaries_lka, # Area of Interest
spatial_join_method="centroid", # Spatial join method (between h3 cells and each feature)
fields=['spi'], # *Time-series fields from Space2Stats to query
start_date="2019-01-01", # Start date (will default to earliest date available)
geometry="polygon" # Whether to return the geometry of the hexagons
)
df_ts.head()
## Currently, get_timeseries returns time-series rows keyed only by an area ID,
## dropping the attributes of the admin areas :(
## For now, we merge the attributes back onto the time-series data
## but we will fix this :)
# NOTE(review): this assumes `area_id` matches the positional index of
# adm_boundaries_lka -- TODO confirm against the client implementation.
df_ts = df_ts.merge(
adm_boundaries_lka.drop(columns=["geometry"]), left_on="area_id", right_index=True, how="left"
)
# Convert date strings to datetime objects and derive a year column
df_ts['date'] = pd.to_datetime(df_ts['date'])
df_ts['year'] = df_ts['date'].dt.year
# Keep only the 2024 observations for the monthly maps
df_filter = df_ts.loc[df_ts['year'] == 2024].copy()
# Parse GeoJSON geometry strings into shapely geometries.
# `from_geojson` is applied directly -- wrapping it in a lambda was redundant.
df_filter["geometry"] = df_filter["geometry"].apply(from_geojson)
# Promote to a GeoDataFrame in WGS84.
# NOTE(review): this reuses the name `gdf` from the South Sudan section above.
gdf = gpd.GeoDataFrame(df_filter, geometry="geometry", crs="EPSG:4326")
# Year-month label used for faceting the monthly maps
gdf['ym'] = gdf['date'].dt.strftime('%Y-%m')
# Plotting helpers for the climate section.
# NOTE(review): these imports would ideally live in the top import cell so the
# notebook's dependencies are visible up front.
from mizani.breaks import date_breaks
from mizani.formatters import date_format
from plotnine import (
ggplot,
aes,
geom_bar,
geom_map,
coord_fixed,
facet_wrap,
scale_fill_distiller,
element_rect,
theme_void,
theme_minimal,
theme,
labs,
element_text,
scale_y_continuous,
scale_x_datetime
)
Map Monthly SPI (Drought Index)#
# Faceted choropleth: one small map of hexagon-level SPI per month of 2024.
# The diverging palette is clipped to [-2, 2], the usual SPI display range.
(
ggplot(gdf)
+ geom_map(aes(fill="spi"), size=0)
+ scale_fill_distiller(type="div", palette="RdBu", name="SPI", limits=(-2, 2))
+ facet_wrap(
"ym",
ncol=4,
)
+ coord_fixed(expand=False)
+ theme_void()
+ theme(
figure_size=(8, 8),
plot_margin=0.01,
plot_background=element_rect(fill="white"),
panel_spacing=0.025
)
+ labs(title="Monthly SPI (Standardized Precipitation Index), Sri Lanka")
)
National Average#
# National average SPI per date across all districts
df_average = df_ts.groupby('date', as_index=False)['spi'].mean().rename(columns={'spi': 'mean'})
# NOTE(review): matplotlib falls back to its default font if Roboto is not installed
font = "Roboto"
# National-average SPI time series as a diverging bar chart
p = (
ggplot(df_average, aes(x="date", y="mean", fill="mean"))
+ geom_bar(alpha=0.8, stat="identity", color="black", width=20)  # stat="identity": bar height = value
+ labs(
x="",
subtitle="Standardised Precipitation Index",
title="Drought Index",
y="",
caption="Source: Space2Stats",
)
+ theme_minimal()
+ theme(
plot_background=element_rect(fill="white"),
figure_size=(8, 6),
text=element_text(family=font, size=11),
plot_title=element_text(family=font, size=14, weight="bold"),
legend_position="none",
)
+ scale_fill_distiller(
type="div", palette="RdYlBu", direction=1, limits=(-2, 2)  # clip fill scale to [-2, 2]
)
+ scale_y_continuous(limits=(-2, 2))
+ scale_x_datetime(
breaks=date_breaks(width="1 year"), labels=date_format("%Y")
)
)
p
Extract Drought Events per District/Year#
# Peek at the merged time-series table
df_ts.head()
# SPI at or below -2 is treated as an extreme drought event
drought_threshold = -2
# Binary flag: 1 when the observation's SPI is at or below the threshold
df_ts['extreme_drought'] = (df_ts['spi'] <= drought_threshold).astype(int)
# Count flagged observations (months) per district and year
yearly_drought = (
    df_ts.groupby(['shapeName', 'year'], as_index=False)['extreme_drought'].sum()
)
# Wide table: one row per district, one column per year
drought_pivot = yearly_drought.pivot(index='shapeName', columns='year', values='extreme_drought')
# Stringify the year column labels for display
drought_pivot.columns = drought_pivot.columns.map(str)
# Total drought events across all years, used for ranking
drought_pivot['Total Drought Events'] = drought_pivot.sum(axis=1)
drought_pivot = drought_pivot.sort_values('Total Drought Events', ascending=False)
# Back to a flat table with a 'District' column
result_table = drought_pivot.reset_index().rename(columns={'shapeName': 'District'})
display(result_table)
# Attach the per-district drought counts to the Sri Lanka boundaries for mapping
adm_boundaries_lka = adm_boundaries_lka.merge(result_table, left_on="shapeName", right_on="District", how="left")
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
adm_boundaries_lka.plot(
    ax=ax, column="Total Drought Events", legend=True,
    cmap="Reds", scheme="natural_breaks",
    k=5, legend_kwds=dict(title='Total Drought Events'),
    linewidth=0.2, edgecolor='black')
ctx.add_basemap(ax, source=ctx.providers.Esri.WorldPhysical, crs='EPSG:4326')
# Fixed title: the threshold applied above is SPI <= -2, not SPI < 2
plt.title("Monthly Droughts (SPI <= -2), Sri Lanka", fontsize=16)
plt.axis("off")
plt.show()