Flood Exposure Notebook

Flood Exposure Notebook#

Explore S2S Metadata and API

This notebook walks through an example that explores the Space2Stats Metadata, and then uses the API to fetch flood data for various provinces within a country.

import pandas as pd
import geopandas as gpd
import requests
from shapely import from_geojson
from pystac import Catalog
from typing import Dict, Literal, List, Optional
import matplotlib.pyplot as plt # pip install matplotlib contextily
import contextily as ctx

BASE_URL = "https://space2stats.ds.io"
FIELDS_ENDPOINT = f"{BASE_URL}/fields"
SUMMARY_ENDPOINT = f"{BASE_URL}/summary"
AGGREGATION_ENDPOINT = f"{BASE_URL}/aggregate"
catalog = Catalog.from_file("https://raw.githubusercontent.com/worldbank/DECAT_Space2Stats/refs/heads/main/space2stats_api/src/space2stats_ingest/METADATA/stac/catalog.json")

Helper Functions#

The following python functions are wrappers to help interact with the API.

def get_topics(catalog: Catalog) -> pd.DataFrame:
    """Get a table of items (dataset themes/topics) from the STAC catalog."""
    items = catalog.get_all_items()
    items = list(items)
    topics = [{i.id: {k: v for k, v in i.properties.items() if k in ["name", "description", "source_data"]}} for i in items]
    topics = [pd.DataFrame(t) for t in topics]
    topics = pd.concat(topics, axis=1)
    topics.index.name = "Item ID"
    return topics.transpose()

def get_fields(item_id: str, catalog: Catalog) -> Dict:
    """Get a table with a description of variables for a given dataset (item)."""
    collection = next(catalog.get_collections())
    item = collection.get_item(item_id)
    fields = item.properties["table:columns"]
    return pd.DataFrame(fields)

def fetch_admin_boundaries(iso3: str, adm: str) -> gpd.GeoDataFrame:
    """Fetch administrative boundaries from GeoBoundaries API."""
    url = f"https://www.geoboundaries.org/api/current/gbOpen/{iso3}/{adm}/"
    res = requests.get(url).json()
    return gpd.read_file(res["gjDownloadURL"])

def get_summary(
        gdf: gpd.GeoDataFrame, 
        spatial_join_method: Literal["touches", "centroid", "within"], 
        fields: List[str], 
        geometry: Optional[Literal["polygon", "point"]] = None
        ) -> pd.DataFrame:
    """Extract h3 level data from Space2Stats for a GeoDataFrame.

    Parameters
    ----------
    gdf : GeoDataFrame
        The Areas of Interest

    spatial_join_method : ["touches", "centroid", "within"]
        The method to use for performing the spatial join between the AOI and H3 cells
            - "touches": Includes H3 cells that touch the AOI
            - "centroid": Includes H3 cells where the centroid falls within the AOI
            - "within": Includes H3 cells entirely within the AOI

    fields : List[str]
        A list of field names to retrieve from the statistics table.

    geometry : Optional["polygon", "point"]
        Specifies if the H3 geometries should be included in the response. It can be either "polygon" or "point". If None, geometries are not included

    Returns
    -------
    DataFrame
        A DataFrame with the requested fields for each H3 cell that intersects with the AOIs.
    """
    res_all = {}
    for idx, row in gdf.iterrows():
        request_payload = {
            "aoi": {
                "type": "Feature",
                "geometry": row.geometry.__geo_interface__,
                "properties": {},
            },
            "spatial_join_method": spatial_join_method,
            "fields": fields,
            "geometry": geometry,
        }
        response = requests.post(SUMMARY_ENDPOINT, json=request_payload)

        if response.status_code != 200:
            raise Exception(f"Failed to get summary: {response.text}")
        
        summary_data = response.json()
        if not summary_data:
            print(f"Failed to get summary for {idx}")
            summary_data = pd.DataFrame()  # Return an empty DataFrame if no data

        df = pd.DataFrame(summary_data)
        res_all[idx] = df
    
    res_all = pd.concat(res_all, names=["index_gdf", "index_h3"])
    res_all = res_all.reset_index()
    gdf_copy = gdf.copy()
    gdf_copy.drop(columns=["geometry"], inplace=True)
    res_all = gdf_copy.merge(res_all, left_index=True, right_on="index_gdf")
    return res_all

def get_aggregate(
        gdf: gpd.GeoDataFrame, 
        spatial_join_method: Literal["touches", "centroid", "within"],
        fields: list, 
        aggregation_type: Literal["sum", "avg", "count", "max", "min"]
    ) -> pd.DataFrame:
    """Extract summary statistic from underlying H3 Space2Stats data.

    Parameters
    ----------
    gdf : GeoDataFrame
        The Areas of Interest

    spatial_join_method : ["touches", "centroid", "within"]
        The method to use for performing the spatial join between the AOI and H3 cells
            - "touches": Includes H3 cells that touch the AOI
            - "centroid": Includes H3 cells where the centroid falls within the AOI
            - "within": Includes H3 cells entirely within the AOI

    fields : List[str]
        A list of field names to retrieve from the statistics table.

    aggregation_type : ["sum", "avg", "count", "max", "min"]
        Statistical function to apply to each field per AOI.

    Returns
    -------
    DataFrame
        A DataFrame with the requested fields for each H3 cell that intersects with the AOIs.
    
    """
    res_all = []
    for idx, row in gdf.iterrows():
        request_payload = {
            "aoi": {
                "type": "Feature",
                "geometry": row.geometry.__geo_interface__,
                "properties": {},
            },
            "spatial_join_method": spatial_join_method,
            "fields": fields,
            "aggregation_type": aggregation_type,
        }
        response = requests.post(AGGREGATION_ENDPOINT, json=request_payload)

        if response.status_code != 200:
            raise Exception(f"Failed to get aggregate: {response.text}")
        
        aggregate_data = response.json()
        if not aggregate_data:
            print(f"Failed to get summary for {idx}")
            aggregate_data = pd.DataFrame()  # Return an empty DataFrame if no data

        df = pd.DataFrame(aggregate_data, index=[idx])
        res_all.append(df)
    
    res_all = pd.concat(res_all)
    gdf_copy = gdf.copy()
    res_all = gdf_copy.join(res_all)
    return res_all

Query Metadata#

Each dataset in Space2Stats is stored as a STAC item. Metadata for each item can be explored through the following browser.

The get_topics function retrieves a list of dictionaries with key details for each dataset. The keys in each dictionary are the item ids.

topics = get_topics(catalog)
pd.options.display.max_colwidth = None
topics

Item ID	name	description	source_data
space2stats_population_2020	Population	Gridded population disaggregated by gender.	WorldPop gridded population, 2020, Unconstrained, UN-Adjusted, https://www.worldpop.org/methods/top_down_constrained_vs_unconstrained/
flood_exposure_15cm_1in100	Population Exposed to Floods	Population where flood depth is greater than 15 cm, 1-in-100 return period.	Fathom 3.0 High Resolution Global Flood Maps Including Climate Scenarios, https://datacatalog.worldbank.org/search/dataset/0065653/Fathom-3-0---High-Resolution-Global-Flood-Maps-Including-Climate-Scenarios
nighttime_lights_2013	Nighttime Lights	Sum of luminosity values measured by monthly composites from VIIRS satellite.	World Bank - Light Every Night, https://registry.opendata.aws/wb-light-every-night/
urbanization_ghssmod	Urbanization by population and by area	Urbanization is analyzed using the GHS-SMOD dataset, including comparisons with population	Global Human Settlement Layer (https://human-settlement.emergency.copernicus.eu/degurbaDefinitions.php)

We can extract additional metadata like fields and descriptions using the item id.

fields = get_fields("flood_exposure_15cm_1in100", catalog)
fields

	name	description	type
0	hex_id	H3 unique identifier	object
1	pop	Sum of Gridded Population, 2020	float32
2	pop_flood	Sum of population exposed to floods greater than 15 cm, 1 in 100 return period	float64
3	pop_flood_pct	Percent of population exposed to floods greater than 15 cm, 1 in 100 return period	float64

Alternatively, we can also explore the fields avaialble via the API fields endpoint:

response = requests.get(FIELDS_ENDPOINT)
if response.status_code != 200:
    raise Exception(f"Failed to get fields: {response.text}")

available_fields = response.json()
print("Available Fields:", available_fields)

Available Fields: ['ghs_total_pop', 'ogc_fid', 'sum_pop_f_0_2020', 'sum_pop_f_10_2020', 'sum_pop_f_15_2020', 'sum_pop_f_1_2020', 'sum_pop_f_20_2020', 'sum_pop_f_25_2020', 'sum_pop_f_30_2020', 'sum_pop_f_35_2020', 'sum_pop_f_40_2020', 'sum_pop_f_45_2020', 'sum_pop_f_50_2020', 'sum_pop_f_55_2020', 'sum_pop_f_5_2020', 'sum_pop_f_60_2020', 'sum_pop_f_65_2020', 'sum_pop_f_70_2020', 'sum_pop_f_75_2020', 'sum_pop_f_80_2020', 'sum_pop_m_0_2020', 'sum_pop_m_10_2020', 'sum_pop_m_15_2020', 'sum_pop_m_1_2020', 'sum_pop_m_20_2020', 'sum_pop_m_25_2020', 'sum_pop_m_30_2020', 'sum_pop_m_35_2020', 'sum_pop_m_40_2020', 'sum_pop_m_45_2020', 'sum_pop_m_50_2020', 'sum_pop_m_55_2020', 'sum_pop_m_5_2020', 'sum_pop_m_60_2020', 'sum_pop_m_65_2020', 'sum_pop_m_70_2020', 'sum_pop_m_75_2020', 'sum_pop_m_80_2020', 'sum_pop_m_2020', 'sum_pop_f_2020', 'sum_pop_2020', 'pop', 'pop_flood', 'pop_flood_pct', 'ghs_11_count', 'ghs_12_count', 'ghs_13_count', 'ghs_21_count', 'ghs_22_count', 'ghs_23_count', 'ghs_30_count', 'ghs_total_count', 'ghs_11_pop', 'ghs_12_pop', 'ghs_13_pop', 'ghs_21_pop', 'ghs_22_pop', 'ghs_23_pop', 'ghs_30_pop']

Extract H3 Data#

Let’s work with the subset of fields from the flood exposure item: ['pop', 'pop_flood', 'pop_flood_pct']

flood_vars = ['pop', 'pop_flood', 'pop_flood_pct']

We will define our AOIs by fetching admin boundaries from the GeoBoundaries project.

ISO3 = "SSD" # South Sudan
ADM = "ADM2" # Level 2 administrative boundaries
adm_boundaries = fetch_admin_boundaries(ISO3, ADM)
adm_boundaries.plot()

<Axes: >

../_images/e60248ff841f6dd4783ccf88cb0daed410c032f179e420c43d4801377eee41ba.png

get_summary?

Signature:
get_summary(
    gdf: geopandas.geodataframe.GeoDataFrame,
    spatial_join_method: Literal['touches', 'centroid', 'within'],
    fields: List[str],
    geometry: Optional[Literal['polygon', 'point']] = None,
) -> pandas.core.frame.DataFrame
Docstring:
Extract h3 level data from Space2Stats for a GeoDataFrame.

Parameters
----------
gdf : GeoDataFrame
    The Areas of Interest

spatial_join_method : ["touches", "centroid", "within"]
    The method to use for performing the spatial join between the AOI and H3 cells
        - "touches": Includes H3 cells that touch the AOI
        - "centroid": Includes H3 cells where the centroid falls within the AOI
        - "within": Includes H3 cells entirely within the AOI

fields : List[str]
    A list of field names to retrieve from the statistics table.

geometry : Optional["polygon", "point"]
    Specifies if the H3 geometries should be included in the response. It can be either "polygon" or "point". If None, geometries are not included

Returns
-------
DataFrame
    A DataFrame with the requested fields for each H3 cell that intersects with the AOIs.
File:      c:\users\wb514197\appdata\local\temp\ipykernel_33312\2584730817.py
Type:      function

Run API Calls

df = get_summary(
    gdf=adm_boundaries, 
    spatial_join_method="centroid", 
    fields=flood_vars,
    geometry="polygon"
)

pd.reset_option('display.max_colwidth')
df.head()

	shapeName	shapeID	shapeGroup	shapeType	index_h3	hex_id	geometry	pop	pop_flood	pop_flood_pct
0	Morobo	79223893B16328156709571	SSD	ADM2	0	866ae1067ffffff	{"type":"Polygon","coordinates":[[[30.62327290...	4204.4385	609.631552	0.144997
1	Morobo	79223893B16328156709571	SSD	ADM2	1	866ae106fffffff	{"type":"Polygon","coordinates":[[[30.68007919...	4776.9630	459.478346	0.096186
2	Morobo	79223893B16328156709571	SSD	ADM2	2	866ae114fffffff	{"type":"Polygon","coordinates":[[[30.61445415...	3752.4004	341.734756	0.091071
3	Morobo	79223893B16328156709571	SSD	ADM2	3	866ae116fffffff	{"type":"Polygon","coordinates":[[[30.60563958...	3304.6606	392.324178	0.118718
4	Morobo	79223893B16328156709571	SSD	ADM2	4	866ae132fffffff	{"type":"Polygon","coordinates":[[[30.84166572...	7512.3320	865.229318	0.115175

Check that there are no duplicate hexagon ids

df['hex_id'].duplicated().sum()

Convert geometry column from geojson into shapely polygons

df["geometry"] = df["geometry"].apply(lambda geom: from_geojson(geom))
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
gdf.head()

	shapeName	shapeID	shapeGroup	shapeType	index_h3	hex_id	geometry	pop	pop_flood	pop_flood_pct
0	Morobo	79223893B16328156709571	SSD	ADM2	0	866ae1067ffffff	POLYGON ((30.62327 3.62973, 30.63632 3.66096, ...	4204.4385	609.631552	0.144997
1	Morobo	79223893B16328156709571	SSD	ADM2	1	866ae106fffffff	POLYGON ((30.68008 3.60768, 30.69312 3.63891, ...	4776.9630	459.478346	0.096186
2	Morobo	79223893B16328156709571	SSD	ADM2	2	866ae114fffffff	POLYGON ((30.61445 3.68759, 30.6275 3.71879, 3...	3752.4004	341.734756	0.091071
3	Morobo	79223893B16328156709571	SSD	ADM2	3	866ae116fffffff	POLYGON ((30.60564 3.7454, 30.61869 3.77659, 3...	3304.6606	392.324178	0.118718
4	Morobo	79223893B16328156709571	SSD	ADM2	4	866ae132fffffff	POLYGON ((30.84167 3.59938, 30.85469 3.6306, 3...	7512.3320	865.229318	0.115175

Map hexagon data

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(ax=ax, column="pop_flood_pct", 
         legend=True, cmap="Reds", alpha=0.75, 
         scheme="equal_interval", k=5, 
         legend_kwds=dict(title='% Pop. Exposed', fmt="{:.0%}"),
         linewidth=0)
ctx.add_basemap(ax, source=ctx.providers.Esri.WorldPhysical, crs='EPSG:4326')
plt.axis("off")
plt.show()

../_images/a5aac87b3bf41466ee66e245a70bf389e81dd7904a0fc102d8e8b04bc4ce2ba5.png

Extract Admin Summaries#

adm_boundaries_zs = get_aggregate(
    gdf=adm_boundaries, 
    spatial_join_method="centroid", 
    fields=['pop', 'pop_flood'], 
    aggregation_type="sum"
)

adm_boundaries_zs.head()

	shapeName	shapeID	shapeGroup	shapeType	geometry	pop	pop_flood
0	Morobo	79223893B16328156709571	SSD	ADM2	POLYGON ((30.56014 3.80448, 30.56025 3.80123, ...	163323.45	18611.677807
1	Kajo-keji	79223893B65724383376583	SSD	ADM2	POLYGON ((31.21043 4.02022, 31.09764 3.97353, ...	323397.56	34155.373103
2	Lainya	79223893B40788586970477	SSD	ADM2	POLYGON ((31.09684 3.73816, 31.097 3.738, 31.0...	152762.36	17860.699796
3	Yei	79223893B15931514297219	SSD	ADM2	POLYGON ((30.92897 3.92614, 30.92401 3.95754, ...	338000.16	44152.720881
4	Juba	79223893B61077901252755	SSD	ADM2	POLYGON ((31.72803 3.99162, 31.72429 4.00022, ...	641935.60	99231.778778

Recalculate share of population exposed with aggregate data

adm_boundaries_zs.loc[:, "pop_flood_pct"] = adm_boundaries_zs["pop_flood"] / adm_boundaries_zs["pop"]

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
adm_boundaries_zs.plot(
    ax=ax, column="pop_flood_pct", legend=True, 
    cmap="Reds", scheme="natural_breaks", 
    k=5, legend_kwds=dict(title='% Pop. Exposed', fmt="{:.0%}"),
    linewidth=0.2, edgecolor='black')
ctx.add_basemap(ax, source=ctx.providers.Esri.WorldPhysical, crs='EPSG:4326')
plt.axis("off")
plt.show()

../_images/07c549ba22b70a62c4de3a97391dd39ff3b672fc800593b60df8e2a20efe6fe6.png

List top 20 provinces by population exposed

table = adm_boundaries_zs.sort_values('pop_flood_pct', ascending=False).head(20)[['shapeName', 'pop_flood', 'pop_flood_pct']].rename(
    columns={
        'shapeName': 'Province'
        })
table.loc[:, "Population Exposed"] = table.loc[:, "pop_flood"].apply(lambda x: f"{x:,.0f}")
table.loc[:, "Population Exposed (%)"] = table.loc[:, "pop_flood_pct"].apply(lambda x: f"{x:.2%}")
table.reset_index(drop=True, inplace=True)
display(table[['Province', 'Population Exposed', 'Population Exposed (%)']])

	Province	Population Exposed	Population Exposed (%)
0	Guit	39,438	68.05%
1	Ulang	91,703	65.99%
2	Leer	54,570	63.61%
3	Malakal	103,890	61.19%
4	Mayom	110,430	56.28%
5	Mayendit	50,026	54.05%
6	Gogrial West	152,568	53.71%
7	Panyijiar	46,028	53.33%
8	Panyikang	43,152	53.03%
9	Melut	40,606	52.15%
10	Fangak	87,145	51.88%
11	Fashoda	34,966	51.53%
12	Manyo	33,179	48.29%
13	Koch	57,514	47.25%
14	Twic	110,514	46.91%
15	Baliet	40,963	45.50%
16	Rumbek North	25,182	44.86%
17	Gogrial East	51,352	41.18%
18	Luakpiny/Nasir	133,452	41.08%
19	Aweil South	30,248	40.88%