Indexing The World Bank Catalog¶
Example notebook showing how to index a dataset from the World Bank catalog.
In [15]:
Copied!
import json
from datetime import datetime
from pathlib import Path
import contextily as cx
import pandas as pd
import requests
from bs4 import BeautifulSoup
from h3ronpy.arrow import cells_parse
from h3ronpy.pandas.vector import cells_dataframe_to_geodataframe
from worldex.datasets.worldbankcatalog import WorldBankCatalogDataset
import json
from datetime import datetime
from pathlib import Path
import contextily as cx
import pandas as pd
import requests
from bs4 import BeautifulSoup
from h3ronpy.arrow import cells_parse
from h3ronpy.pandas.vector import cells_dataframe_to_geodataframe
from worldex.datasets.worldbankcatalog import WorldBankCatalogDataset
Indexing a single catalog entry¶
In [4]:
Copied!
# Request envelope posted to the World Bank data catalog search endpoint.
# "SearchUrl" carries the actual search query: public datasets that have at
# least one JSON, SHP or TIFF resource, ordered by last_updated_date
# (descending), skipping the first 50 hits ($skip=50) and returning at most
# 500 ($top=500).
# NOTE(review): presumably the endpoint forwards this query to the search
# index on our behalf ("KeyName" names the key it should use) -- confirm.
body = {
    "SearchUrl": "https://itsdt-ddhext-search-prd.search.windows.net/indexes/ddh-dataset/docs?api-version=2020-06-30&search=*&$count=true&queryType=full&$top=500&$orderby=last_updated_date desc&$select=ClickCount,name,dataset_unique_id,identification,geographical_extent/coverage,lineage,Indicator,searchterms,data_quality/data_notes,last_updated_date,modified_on,constraints/security/classification,Resources&highlight=name,identification/title,identification/description,Indicators/title&$searchFields=Indicators/title,Indicators/description,identification/title,identification/description,searchterms,identification/acronym,geographical_extent/coverage/name,lineage/source_reference,high_priority_tags,priority_tags,Indicator,data_quality/data_notes,lineage/description,lineage/statistical_concept_and_methodology,lineage/study_type&highlightPreTag=<span class='search-chars'>&highlightPostTag=</span>&$filter=constraints/security/classification eq 'Public' and (Resources/any(res:res/format eq 'JSON') or Resources/any(res:res/format eq 'SHP') or Resources/any(res:res/format eq 'TIFF'))&$skip=50&facet=constraints/license/license_id,count:20000&facet=constraints/security/classification,count:20000&facet=identification/working_unit_vpu/name,count:20000&facet=Resources/format,count:20000&facet=temporal_resolution/periodicity,count:20000&facet=identification/language_supported/name,count:20000&facet=spatial_resolution/granularity,count:20000&facet=keywords/name,count:20000&facet=identification/collection_code,count:20000&facet=geographical_extent/coverage/name,count:20000&facet=lineage/harvest_system,count:20000&facet=identification/topics/name,count:20000&facet=keywords_list,count:20000",
    "Body": "",
    "KeyName": "AzureSearchApiKey",
    "RequestType": "GET",
    "HeaderInfo": [{"Key": "x-ms-azs-return-searchid", "Value": "true"}],
}

# Bound the wait so an unresponsive server cannot hang the notebook, and fail
# loudly on an HTTP error instead of with a confusing KeyError further down.
response = requests.post(
    "https://datacatalogapi.worldbank.org/ddhxext/SearchData",
    json=body,
    timeout=60,
)
response.raise_for_status()

# The matching catalog entries are under Response -> value; the first one is
# used as the example entry for the rest of the notebook.
values = response.json()['Response']['value']
value = values[0]
value
# Request envelope posted to the World Bank data catalog search endpoint.
# "SearchUrl" carries the actual search query: public datasets that have at
# least one JSON, SHP or TIFF resource, ordered by last_updated_date
# (descending), skipping the first 50 hits ($skip=50) and returning at most
# 500 ($top=500).
# NOTE(review): presumably the endpoint forwards this query to the search
# index on our behalf ("KeyName" names the key it should use) -- confirm.
body = {
    "SearchUrl": "https://itsdt-ddhext-search-prd.search.windows.net/indexes/ddh-dataset/docs?api-version=2020-06-30&search=*&$count=true&queryType=full&$top=500&$orderby=last_updated_date desc&$select=ClickCount,name,dataset_unique_id,identification,geographical_extent/coverage,lineage,Indicator,searchterms,data_quality/data_notes,last_updated_date,modified_on,constraints/security/classification,Resources&highlight=name,identification/title,identification/description,Indicators/title&$searchFields=Indicators/title,Indicators/description,identification/title,identification/description,searchterms,identification/acronym,geographical_extent/coverage/name,lineage/source_reference,high_priority_tags,priority_tags,Indicator,data_quality/data_notes,lineage/description,lineage/statistical_concept_and_methodology,lineage/study_type&highlightPreTag=&highlightPostTag=&$filter=constraints/security/classification eq 'Public' and (Resources/any(res:res/format eq 'JSON') or Resources/any(res:res/format eq 'SHP') or Resources/any(res:res/format eq 'TIFF'))&$skip=50&facet=constraints/license/license_id,count:20000&facet=constraints/security/classification,count:20000&facet=identification/working_unit_vpu/name,count:20000&facet=Resources/format,count:20000&facet=temporal_resolution/periodicity,count:20000&facet=identification/language_supported/name,count:20000&facet=spatial_resolution/granularity,count:20000&facet=keywords/name,count:20000&facet=identification/collection_code,count:20000&facet=geographical_extent/coverage/name,count:20000&facet=lineage/harvest_system,count:20000&facet=identification/topics/name,count:20000&facet=keywords_list,count:20000",
    "Body": "",
    "KeyName": "AzureSearchApiKey",
    "RequestType": "GET",
    "HeaderInfo": [{"Key": "x-ms-azs-return-searchid", "Value": "true"}],
}

# Bound the wait so an unresponsive server cannot hang the notebook, and fail
# loudly on an HTTP error instead of with a confusing KeyError further down.
response = requests.post(
    "https://datacatalogapi.worldbank.org/ddhxext/SearchData",
    json=body,
    timeout=60,
)
response.raise_for_status()

# The matching catalog entries are under Response -> value; the first one is
# used as the example entry for the rest of the notebook.
values = response.json()['Response']['value']
value = values[0]
value
Out[4]:
{'@search.score': 1.0, 'name': 'Burkina Faso - Vegetation trend (human-induced)', 'dataset_unique_id': '0037827', 'ClickCount': None, 'last_updated_date': '2021-02-03T12:08:19Z', 'searchterms': [], 'Indicator': [], 'modified_on': '2023-01-18T21:50:31Z', 'identification': {'id': '4278c167-bfc7-eb11-bacc-000d3a5a1c19', 'title': 'Burkina Faso - Vegetation trend (human-induced)', 'subtitle': None, 'description': '<p>Human induced vegetation trend in Burkina Faso at 250m resolution from 2000-2017</p>\n', 'acronym': None, 'wb_project_reference': None, 'grant_number': None, 'collection_id': ['Earth Observation for Sustainable Development'], 'translated_title': None, 'citation': None, 'collection_code': ['EARTH_OBSERVATION_SUSTAINABLE'], 'doi_url': None, 'topics': [{'id': '4378c167-bfc7-eb11-bacc-000d3a5a1c19', 'name': 'Agriculture and Food Security'}], 'practice': None, 'work_unit': {'code': '0000004338', 'name': 'Development Committee'}, 'working_unit_vpu': {'code': '0000004338', 'name': 'DEC'}, 'parent_dataset': None, 'point_of_contact': [], 'dates': [{'id': '4678c167-bfc7-eb11-bacc-000d3a5a1c19', 'date': '3/1/2018 12:00:00 AM', 'type': 'RELEASE_DATE'}, {'id': '4778c167-bfc7-eb11-bacc-000d3a5a1c19', 'date': '2/3/2021 12:08:19 PM', 'type': 'LAST_UPDATED_DATE'}, {'id': '4878c167-bfc7-eb11-bacc-000d3a5a1c19', 'date': '8/27/2019 7:02:18 PM', 'type': 'CREATED_DATE'}], 'language_supported': [{'id': '4978c167-bfc7-eb11-bacc-000d3a5a1c19', 'code': 'EN', 'name': 'English'}]}, 'constraints': {'security': {'classification': 'Public'}}, 'data_quality': {'data_notes': None}, 'geographical_extent': {'coverage': [{'id': '4d78c167-bfc7-eb11-bacc-000d3a5a1c19', 'code': 'BF', 'name': 'Burkina Faso'}]}, 'lineage': {'id': '5178c167-bfc7-eb11-bacc-000d3a5a1c19', 'source_type': None, 'source': None, 'source_reference': None, 'statistical_concept_and_methodology': None, 'base_period': None, 'aggregation_method': None, 'description': None, 'related_links_and_publications': [], 
'publication_place': None, 'funding_name_abbreviation_role': [], 'study_type': None, 'deviations_from_sample_design': None, 'other_acknowledgements': None, 'harvest_system': 'Others', 'harvest_system_reference': None, 'source_type_list': []}, 'Resources': [{'resource_id': '6584be6d-bfc7-eb11-bacc-000d3a5a1c19', 'resource_unique_id': 'DR0045795', 'name': 'Burkina Faso - Veget. trend (human-induced)', 'description': None, 'resource_type': 'Download', 'url': 'https://datacatalogfiles.worldbank.org/ddh-published/0037827/DR0045795/veget-trend-human-induced.tiff?versionId=2023-01-18T21:50:17.0622884Z', 'website_url': None, 'format': 'TIFF', 'harvest_source': 'https://development-data-hub-s3-public.s3.amazonaws.com/ddhfiles/395626/veget-trend-human-induced.tiff', 'harvest_system_reference': 'https://development-data-hub-s3-public.s3.amazonaws.com/ddhfiles/395626/veget-trend-human-induced.tiff', 'harvest_system': None, 'distribution_size': '1.0 MB'}, {'resource_id': '8684be6d-bfc7-eb11-bacc-000d3a5a1c19', 'resource_unique_id': 'DR0045796', 'name': 'EO4SD Agriculture Portal', 'description': None, 'resource_type': 'Landing page', 'url': None, 'website_url': 'https://eo4sd.lizard.net', 'format': 'HTML', 'harvest_source': 'https://eo4sd.lizard.net', 'harvest_system_reference': 'https://eo4sd.lizard.net', 'harvest_system': None, 'distribution_size': None}]}
In [6]:
Copied!
# Output directory for this catalog entry, named after the dataset.
# Missing parent directories are created; an existing directory is reused.
path = Path("../data/worldbankcatalog/{}".format(value["name"]))
path.mkdir(exist_ok=True, parents=True)
# Output directory for this catalog entry, named after the dataset.
# Missing parent directories are created; an existing directory is reused.
path = Path("../data/worldbankcatalog/{}".format(value["name"]))
path.mkdir(exist_ok=True, parents=True)
In [7]:
Copied!
# Parse information

# The catalog description is an HTML fragment; keep only its text. Fall back
# to an empty string when the entry has no description, so BeautifulSoup is
# never handed None (the catalog returns None for many text fields).
raw_description = value['identification']["description"] or ""
description = BeautifulSoup(raw_description, 'html.parser').get_text().strip()

# Formats of the resources attached to the entry; resources without a format
# are skipped.
formats = [r['format'] for r in value['Resources'] if r['format']]

# Choose one data format for the whole dataset, checked in this order:
# TIFF, then JSON, then SHP. Stays None when none of them is present.
# NOTE: `format` shadows the builtin, but the name is kept because the cell
# that builds the dataset reads it.
format = None
if 'TIFF' in formats:
    format = 'GeoTiff'
elif 'JSON' in formats:
    format = 'GeoJSON'
elif 'SHP' in formats:
    format = 'SHP'
# Parse information

# The catalog description is an HTML fragment; keep only its text. Fall back
# to an empty string when the entry has no description, so BeautifulSoup is
# never handed None (the catalog returns None for many text fields).
raw_description = value['identification']["description"] or ""
description = BeautifulSoup(raw_description, 'html.parser').get_text().strip()

# Formats of the resources attached to the entry; resources without a format
# are skipped.
formats = [r['format'] for r in value['Resources'] if r['format']]

# Choose one data format for the whole dataset, checked in this order:
# TIFF, then JSON, then SHP. Stays None when none of them is present.
# NOTE: `format` shadows the builtin, but the name is kept because the cell
# that builds the dataset reads it.
format = None
if 'TIFF' in formats:
    format = 'GeoTiff'
elif 'JSON' in formats:
    format = 'GeoJSON'
elif 'SHP' in formats:
    format = 'SHP'
In [8]:
Copied!
# One link per resource: the download URL when there is one, otherwise the
# website URL. Resources that have neither are left out.
resource_links = [
    res['url'] or res['website_url']
    for res in value['Resources']
    if res['url'] or res['website_url']
]

# Describe the catalog entry as a dataset. The projection, keywords, dates
# and accessibility are fixed values here rather than read from the entry.
dataset = WorldBankCatalogDataset(
    name=value["name"],
    last_fetched=datetime.now().isoformat(),
    files=resource_links,
    data_format=format,
    description=description,
    projection="EPSG:4326",
    keywords=[],
    date_start=None,
    date_end=None,
    accessibility="public/open",
    url=f"https://datacatalog.worldbank.org/search/dataset/{value['dataset_unique_id']}",
)

# Point the dataset at the output directory, then index it. The result of
# index() is the cell output.
# NOTE(review): presumably index() also writes metadata.json and
# h3-compact.parquet into that directory, since later cells read them from
# `path` -- confirm.
dataset.set_dir(path)
dataset.index()
# One link per resource: the download URL when there is one, otherwise the
# website URL. Resources that have neither are left out.
resource_links = [
    res['url'] or res['website_url']
    for res in value['Resources']
    if res['url'] or res['website_url']
]

# Describe the catalog entry as a dataset. The projection, keywords, dates
# and accessibility are fixed values here rather than read from the entry.
dataset = WorldBankCatalogDataset(
    name=value["name"],
    last_fetched=datetime.now().isoformat(),
    files=resource_links,
    data_format=format,
    description=description,
    projection="EPSG:4326",
    keywords=[],
    date_start=None,
    date_end=None,
    accessibility="public/open",
    url=f"https://datacatalog.worldbank.org/search/dataset/{value['dataset_unique_id']}",
)

# Point the dataset at the output directory, then index it. The result of
# index() is the cell output.
# NOTE(review): presumably index() also writes metadata.json and
# h3-compact.parquet into that directory, since later cells read them from
# `path` -- confirm.
dataset.set_dir(path)
dataset.index()
Out[8]:
h3_index | |
---|---|
0 | 88544d849bfffff |
1 | 88544d84d3fffff |
2 | 88544d84dbfffff |
3 | 88544d8609fffff |
4 | 88544d860bfffff |
... | ... |
300403 | 887536db65fffff |
300404 | 887536db67fffff |
300405 | 887536db69fffff |
300406 | 887536db6bfffff |
300407 | 887536db6dfffff |
300408 rows × 1 columns
In [18]:
Copied!
# Load the metadata file from the entry's output directory and display it.
metadata_file = path / "metadata.json"
metadata = json.loads(metadata_file.read_text())
metadata
# Load the metadata file from the entry's output directory and display it.
metadata_file = path / "metadata.json"
metadata = json.loads(metadata_file.read_text())
metadata
Out[18]:
{'id': 'bf5f5818-4d47-407e-9d6d-bcdfb714b0cf', 'name': 'Burkina Faso - Vegetation trend (human-induced)', 'source_org': 'Worldbank', 'last_fetched': '2024-03-22T09:13:11.037307', 'files': ['https://datacatalogfiles.worldbank.org/ddh-published/0037827/DR0045795/veget-trend-human-induced.tiff?versionId=2023-01-18T21:50:17.0622884Z', 'https://eo4sd.lizard.net'], 'description': 'Human induced vegetation trend in Burkina Faso at 250m resolution from 2000-2017', 'data_format': 'GeoTiff', 'projection': 'EPSG:4326', 'properties': None, 'bbox': 'POLYGON ((2.3918511246320033 9.3899808000811298, 2.3918511246320033 15.0810143679848832, -5.5225933478592450 15.0810143679848832, -5.5225933478592450 9.3899808000811298, 2.3918511246320033 9.3899808000811298))', 'keywords': [], 'date_start': None, 'date_end': None, 'accessibility': 'public/open', 'url': 'https://datacatalog.worldbank.org/search/dataset/0037827'}
In [17]:
Copied!
# Read the H3 cells stored for this entry.
h3 = pd.read_parquet(path / "h3-compact.parquet")

# Parse the cell id strings and turn each cell into a geometry.
cells = pd.DataFrame({"cell": cells_parse(h3["h3_index"])})
h3_gdf = cells_dataframe_to_geodataframe(cells)

# Reproject to EPSG:3857 (Web Mercator), the CRS add_basemap assumes when
# none is given, so the cells line up with the background tiles.
h3_gdf_reprojected = h3_gdf.to_crs(epsg=3857)

# Semi-transparent cells with black outlines on a CartoDB Positron basemap.
ax = h3_gdf_reprojected.plot(figsize=(10, 10), alpha=0.5, edgecolor="k")
cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)
# Read the H3 cells stored for this entry.
h3 = pd.read_parquet(path / "h3-compact.parquet")

# Parse the cell id strings and turn each cell into a geometry.
cells = pd.DataFrame({"cell": cells_parse(h3["h3_index"])})
h3_gdf = cells_dataframe_to_geodataframe(cells)

# Reproject to EPSG:3857 (Web Mercator), the CRS add_basemap assumes when
# none is given, so the cells line up with the background tiles.
h3_gdf_reprojected = h3_gdf.to_crs(epsg=3857)

# Semi-transparent cells with black outlines on a CartoDB Positron basemap.
ax = h3_gdf_reprojected.plot(figsize=(10, 10), alpha=0.5, edgecolor="k")
cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)