Skip to content

Usage

Creating a ZonalStats object

ZonalStats() is the main class to request temporal and zonal statistics using the GEE backend. The object can be initialized with parameters specifying data inputs and the type of aggregation.

gee_zonal.zonalstats.ZonalStats

Bases: object

Python class to calculate zonal and temporal statistics from Earth Engine datasets (ee.ImageCollection or ee.Image) over vector shapes (ee.FeatureCollections).

Parameters:

Name Type Description Default
target_features ee.FeatureCollection | gpd.GeoDataFrame | str (path to a shapefile/GeoJSON)

vector features

required
statistic_type str (mean, max, median, min, sum, stddev, var, count, minmax, p75, p25, p95, all)

method to aggregate image pixels by zone

required
collection_id str

ID for Earth Engine dataset

None
ee_dataset ee.Image | ee.ImageCollection

input dataset if no collection ID is provided

None
band str

name of image band to use

None
output_name str

file name for output statistics if saved to Google Drive

None
output_dir str

directory name for output statistics if saved to Google Drive

None
frequency str (monthly | annual | original)

temporal frequency for aggregation

'original'
temporal_stat str (mean, max, median, min, sum)

statistic for temporal aggregation

None
scale int

scale for calculation in mts

250
min_threshold int

filter out values lower than threshold

None
mask ee.Image

filter out observations where mask is zero

None
tile_scale int

tile scale factor for parallel processing

1
start_year int

specify start year for statistics

None
end_year int

specify end year for statistics

None
scale_factor int

scale factor to multiply ee.Image to get correct units

None
mapped bool

Boolean to indicate whether to use mapped or non-mapped version of zonal stats

False
Source code in gee_zonal/zonalstats.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
class ZonalStats(object):
    """
    Python class to calculate zonal and temporal statistics from Earth Engine datasets (ee.ImageCollection or ee.Image) over vector shapes (ee.FeatureCollections).

    :param target_features: vector features
    :type target_features: ee.FeatureCollection or gpd.GeoDataFrame or str (path to a shapefile/GeoJSON)
    :param statistic_type: method to aggregate image pixels by zone
    :type statistic_type: str (mean, max, median, min, sum, stddev, var, count, minmax, p75, p25, p95, all)
    :param collection_id: ID for Earth Engine dataset
    :type collection_id: str
    :param ee_dataset: input dataset if no collection ID is provided
    :type ee_dataset: ee.Image or ee.ImageCollection
    :param band: name of image band to use
    :type band: str
    :param output_name: file name for output statistics if saved to Google Drive
    :type output_name: str
    :param output_dir: directory name for output statistics if saved to Google Drive
    :type output_dir: str
    :param frequency: temporal frequency for aggregation
    :type frequency: str (monthly or annual or original)
    :param temporal_stat: statistic for temporal aggregation
    :type temporal_stat: str (mean, max, median, min, sum)
    :param scale: scale for calculation in mts
    :type scale: int
    :param min_threshold: filter out values lower than threshold
    :type min_threshold: int
    :param mask: filter out observations where mask is zero
    :type mask: ee.Image
    :param tile_scale: tile scale factor for parallel processing
    :type tile_scale: int
    :param start_year: specify start year for statistics
    :type start_year: int
    :param end_year: specify end year for statistics
    :type end_year: int
    :param scale_factor: scale factor to multiply ee.Image to get correct units
    :type scale_factor: int
    :param mapped: Boolean to indicate whether to use mapped or non-mapped version of zonal stats
    :type mapped: bool
    """

    def __init__(
        self,
        target_features,
        statistic_type,
        collection_id=None,
        ee_dataset=None,
        band=None,
        output_name=None,
        output_dir=None,
        frequency="original",
        temporal_stat=None,
        scale=250,
        min_threshold=None,
        mask=None,
        tile_scale=1,
        start_year=None,
        end_year=None,
        scale_factor=None,
        mapped=False,
    ):
        self.collection_id = collection_id
        if collection_id is None and ee_dataset is None:
            raise Exception("One of collection_id or ee_dataset must be supplied")
        # Short collection name (text after the last "/"), used to label exports
        self.collection_suffix = (
            collection_id[collection_id.rfind("/") + 1 :] if collection_id else None
        )
        if ee_dataset is None:
            # Resolve the ID against the EE servers: try it as an ImageCollection
            # first, then fall back to a single Image before giving up.
            try:
                ee.ImageCollection(collection_id).getInfo()
                self.ee_dataset = (
                    ee.ImageCollection(collection_id)
                    if band is None
                    else ee.ImageCollection(collection_id).select(band)
                )
            except Exception:
                try:
                    ee.Image(collection_id).getInfo()
                    self.ee_dataset = (
                        ee.Image(collection_id)
                        if band is None
                        else ee.Image(collection_id).select(band)
                    )
                except Exception:
                    raise Exception("Collection ID does not exist")
        else:
            self.ee_dataset = ee_dataset
        cat = Catalog()
        # Catalog metadata row for the collection (start/end dates and years),
        # only available when a collection ID was supplied
        self.metadata = (
            cat.datasets.loc[cat.datasets.id == collection_id].iloc[0]
            if collection_id
            else None
        )
        # Accept an ee.FeatureCollection directly; otherwise convert a
        # GeoDataFrame or shapefile/GeoJSON path via gpd_to_gee
        self.target_features = (
            target_features
            if type(target_features) is ee.FeatureCollection
            else gpd_to_gee(target_features)
        )
        self.statistic_type = statistic_type
        self.frequency = frequency
        self.temporal_stat = temporal_stat
        self.output_dir = output_dir
        self.output_name = output_name
        self.task = None  # populated by runZonalStats when exporting to Drive
        self.scale = scale
        self.min_threshold = min_threshold
        self.mask = mask
        self.scale_factor = scale_factor
        self.tile_scale = tile_scale
        self.start_year = start_year
        self.end_year = end_year
        self.mapped = mapped

    def yList(self, start=None, end=None):
        """
        Create a list of years covered by the dataset

        :param start: first year (defaults to catalog metadata start year)
        :param end: last year, inclusive (defaults to catalog metadata end year)
        :return: ee.List of integer years
        """
        if start is None:
            years = list(range(self.metadata.startyear, self.metadata.endyear + 1, 1))
        else:
            years = list(range(start, end + 1, 1))
        return ee.List(years)

    def ymList(self, start=None, end=None):
        """
        Create a list of year/month pairs covered by the dataset

        :param start: first year (defaults to catalog metadata start date)
        :param end: last year, inclusive (defaults to catalog metadata end date)
        :return: ee.List of "YYYYMM" strings, one per month
        """
        if start is None and end is None:
            start = self.metadata.start_date
            end = self.metadata.end_date
            ym_range = pd.date_range(
                datetime(start.year, start.month, 1),
                datetime(end.year, end.month, 1),
                freq="MS",
            )
            ym_range = list(date.strftime("%Y%m") for date in ym_range)
        else:
            ym_range = pd.date_range(
                datetime(start, 1, 1), datetime(end, 12, 31), freq="MS"
            )
            ym_range = list(date.strftime("%Y%m") for date in ym_range)
        return ee.List(ym_range)

    def ymList_ee(self):
        """
        Create a list of year/month pairs actually present in the dataset,
        computed server-side by iterating over image dates
        """

        def iter_func(image, newlist):
            date = ee.Number.parse(image.date().format("YYYYMM")).format()
            newlist = ee.List(newlist)
            return ee.List(newlist.add(date).sort())

        ymd = self.ee_dataset.iterate(iter_func, ee.List([]))
        return ee.List(ymd).distinct()

    def temporalStack(self, date_list, freq, stat):
        """
        Aggregate the image collection over time

        :param date_list: ee.List of timesteps ("YYYYMM" strings for monthly,
            integer years for annual)
        :param freq: "monthly" or "annual"
        :param stat: temporal statistic (mean, max, median, min, sum, stddev)
        :return: ee.ImageCollection with one reduced image per timestep
        """
        allowed_statistics_ts = {
            "mean": ee.Reducer.mean(),
            "max": ee.Reducer.max(),
            "median": ee.Reducer.median(),
            "min": ee.Reducer.min(),
            "sum": ee.Reducer.sum(),
            "stddev": ee.Reducer.stdDev(),
        }
        if stat not in allowed_statistics_ts.keys():
            raise Exception(
                "temporal statistic must be one of {}".format(
                    ", ".join(list(allowed_statistics_ts.keys()))
                )
            )

        def aggregate_monthly(ym):
            date = ee.Date.parse("YYYYMM", ym)
            y = date.get("year")
            m = date.get("month")
            monthly = (
                self.ee_dataset.filter(ee.Filter.calendarRange(y, y, "year"))
                .filter(ee.Filter.calendarRange(m, m, "month"))
                .reduce(allowed_statistics_ts[stat])
                .set("month", m)
                .set("year", y)
                .set("system:index", ee.String(y.format().cat("_").cat(m.format())))
            )
            return monthly

        def aggregate_annual(y):
            y = ee.Number(y)
            annual = (
                self.ee_dataset.filter(ee.Filter.calendarRange(y, y, "year"))
                .reduce(allowed_statistics_ts[stat])
                .set("year", y)
                .set("system:index", ee.String(y.format()))
            )
            return annual

        if freq == "monthly":
            byTime = ee.ImageCollection.fromImages(date_list.map(aggregate_monthly))
        if freq == "annual":
            byTime = ee.ImageCollection.fromImages(date_list.map(aggregate_annual))
        return byTime  # .toBands()

    def applyWaterMask(self, image, year=None):
        """
        Mask out water pixels using the MODIS MOD44W water mask
        """
        land_mask = (
            ee.Image("MODIS/MOD44W/MOD44W_005_2000_02_24").select("water_mask").eq(0)
        )
        return image.updateMask(land_mask)

    def applyMinThreshold(self, image, min_threshold):
        """
        Mask out pixels below min_threshold
        """
        bool_mask = image.gte(min_threshold)
        return image.updateMask(bool_mask)

    def applyMask(self, image, mask):
        """
        Mask out pixels where mask is zero
        """
        return image.updateMask(mask)

    def applyScaleFactor(self, image, scale_factor):
        """
        Multiply image values by scale_factor to convert units
        """
        return image.multiply(scale_factor)

    def runZonalStats(self):
        """
        Run zonal statistics aggregation

        :return: tabular statistics
        :rtype: DataFrame or dict with EE task status if output_name/dir is specified
        """
        if self.frequency not in ["monthly", "annual", "original"]:
            raise Exception("frequency must be one of annual, monthly, or original")
        if self.frequency == "monthly":
            timesteps = self.ymList(self.start_year, self.end_year)
        elif self.frequency == "annual":
            timesteps = self.yList(self.start_year, self.end_year)
        elif self.frequency == "original":
            # no temporal aggregation; optionally clip to the requested years
            if self.start_year is not None and self.end_year is not None:
                start_year_format = datetime(self.start_year, 1, 1).strftime("%Y-%m-%d")
                end_year_format = datetime(self.end_year, 12, 31).strftime("%Y-%m-%d")
                self.ee_dataset = self.ee_dataset.filterDate(
                    start_year_format, end_year_format
                )
        # Flatten the time dimension into bands of a single image
        if self.frequency == "original":
            if type(self.ee_dataset) is ee.image.Image:
                byTimesteps = self.ee_dataset
            elif type(self.ee_dataset) is ee.imagecollection.ImageCollection:
                byTimesteps = self.ee_dataset.toBands()
        else:
            byTimesteps = self.temporalStack(
                timesteps, self.frequency, self.temporal_stat
            )
            byTimesteps = byTimesteps.toBands()

        # pre-processing: masking, thresholding, unit scaling
        if self.mask is not None:
            if self.mask == "water":
                byTimesteps = self.applyWaterMask(byTimesteps)
            elif type(self.mask) is ee.image.Image:
                byTimesteps = self.applyMask(byTimesteps, self.mask)
        if self.min_threshold is not None:
            byTimesteps = self.applyMinThreshold(byTimesteps, self.min_threshold)
        if self.scale_factor is not None:
            byTimesteps = self.applyScaleFactor(byTimesteps, self.scale_factor)

        allowed_statistics = {
            "count": ee.Reducer.frequencyHistogram().unweighted(),
            "mean": ee.Reducer.mean(),
            "max": ee.Reducer.max(),
            "median": ee.Reducer.median(),
            "min": ee.Reducer.min(),
            "sum": ee.Reducer.sum(),
            "stddev": ee.Reducer.stdDev(),
            "var": ee.Reducer.variance(),
            "minmax": ee.Reducer.minMax(),
            "p75": ee.Reducer.percentile(
                [75]
            ),  # maxBuckets=10 , minBucketWidth=1, maxRaw=1000
            "p25": ee.Reducer.percentile(
                [25]
            ),  # maxBuckets=10 , minBucketWidth=1, maxRaw=1000
            "p95": ee.Reducer.percentile(
                [95]
            ),  # maxBuckets=10 , minBucketWidth=1, maxRaw=1000
            "all": ee.Reducer.mean()
            .combine(ee.Reducer.minMax(), sharedInputs=True)
            .combine(ee.Reducer.stdDev(), sharedInputs=True),
        }

        def combine_reducers(reducer_list):
            # chain reducers so a single pass computes every requested statistic
            reducer = reducer_list[0]
            for r in reducer_list[1:]:
                reducer = reducer.combine(r, sharedInputs=True)
            return reducer

        if type(self.statistic_type) is str:
            if self.statistic_type not in allowed_statistics.keys():
                raise Exception(
                    "statistic must be one of {}".format(
                        ", ".join(list(allowed_statistics.keys()))
                    )
                )
            else:
                reducer = allowed_statistics[self.statistic_type]
        elif type(self.statistic_type) is list:
            for stat_type in self.statistic_type:
                if stat_type not in allowed_statistics.keys():
                    raise Exception(
                        "statistic must be one of {}".format(
                            ", ".join(list(allowed_statistics.keys()))
                        )
                    )
            reducer_list = [
                allowed_statistics[stat_type] for stat_type in self.statistic_type
            ]
            reducer = combine_reducers(reducer_list)

        if self.mapped:
            # reduceRegion per feature, mapped server-side over the collection

            def zs_func(feature):
                zs_result = byTimesteps.reduceRegion(
                    reducer=reducer,
                    geometry=feature.geometry(),
                    scale=self.scale,
                    maxPixels=10e15,  # 1e13
                    tileScale=self.tile_scale,
                )
                feature = feature.set(zs_result)
                return feature

            zs = self.target_features.map(zs_func)  # .getInfo()
        else:
            zs = ee.Image(byTimesteps).reduceRegions(
                collection=self.target_features,
                reducer=reducer,
                scale=self.scale,
                tileScale=self.tile_scale,
            )
        if self.output_dir is not None and self.output_name is not None:
            # export asynchronously to Google Drive; monitor with reportRunTime()
            self.task = ee.batch.Export.table.toDrive(
                collection=zs,
                description=f"Zonal statistics for {self.collection_suffix}",
                fileFormat="CSV",
                folder=self.output_dir,
                fileNamePrefix=self.output_name,
            )
            self.task.start()
            # return(self)
        else:
            res = zs.getInfo()
            return self.get_zonal_res(res)

    def get_zonal_res(self, res, rename=None):
        """
        Create a data frame from the results of GEE zonal statistics

        :param res: response from runZonalStats method retrieved via featureCollection.getInfo()
        :type res: dictionary from ee.FeatureCollection
        :param rename: optional mapping of column names to rename in the output
        :type rename: dict
        :return: one row per feature (indexed by feature id), one column per statistic
        :rtype: DataFrame
        """
        feats = res["features"]
        ids = [f["id"] for f in feats]
        series = [pd.Series(f["properties"]) for f in feats]
        df = pd.DataFrame(data=series, index=ids)
        if rename:
            df.rename(columns=rename, inplace=True)
        return df

    def reportRunTime(self):
        """
        Print the state and elapsed runtime of the Drive export task
        """
        # take one consistent snapshot; each status() call is a server request
        status = self.task.status()
        start_time = status["start_timestamp_ms"]
        update_time = status["update_timestamp_ms"]
        state = status["state"]
        if state == "RUNNING":
            delta = datetime.now() - datetime.fromtimestamp(start_time / 1000)
            print("Still running")
            print(
                f"Runtime: {delta.seconds//60} minutes and {delta.seconds % 60} seconds"
            )
        if state == "COMPLETED":
            delta = datetime.fromtimestamp(update_time / 1000) - datetime.fromtimestamp(
                start_time / 1000
            )
            print("Completed")
            print(
                f"Runtime: {delta.seconds//60} minutes and {delta.seconds % 60} seconds"
            )
        if state == "FAILED":
            print("Failed!")
            print(status["error_message"])
        if state == "READY":
            print("Status is Ready, hasn't started")

    def getZonalStats(self, drive):
        """
        Download the exported statistics table from Google Drive

        :param drive: authenticated Drive client (pydrive-style GoogleDrive
            instance, exposing ListFile / GetContentString — confirm with caller)
        :return: statistics table with EE bookkeeping columns removed
        :rtype: DataFrame
        """
        folder = drive.ListFile(
            {
                "q": f"title = '{self.output_dir}' and trashed=false and mimeType = 'application/vnd.google-apps.folder'"
            }
        ).GetList()[0]
        folder_id = folder["id"]
        export_file = drive.ListFile(
            {
                "q": f"'{folder_id}' in parents and trashed=false and title contains '{self.output_name}'"
            }
        ).GetList()[0]
        s = export_file.GetContentString()
        c = pd.read_csv(io.StringIO(s))
        c.drop([".geo", "system:index"], axis=1, inplace=True)
        return c

Input target features can be referenced directly as a GEE asset, or can be supplied as a geopandas.GeoDataFrame, or a path to a shapefile/GeoJSON (will be automatically converted to ee.FeatureCollection).

gee_zonal.zonalstats.ZonalStats.runZonalStats()

Run zonal statistics aggregation

Returns:

Type Description
DataFrame | dict with EE task status if output_name/dir is specified

tabular statistics

Source code in gee_zonal/zonalstats.py
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
def runZonalStats(self):
    """
    Run zonal statistics aggregation

    :return: tabular statistics
    :rtype: DataFrame or dict with EE task status if output_name/dir is specified
    """
    if self.frequency not in ["monthly", "annual", "original"]:
        raise Exception("frequency must be one of annual, monthly, or original")
    if self.frequency == "monthly":
        timesteps = self.ymList(self.start_year, self.end_year)
    elif self.frequency == "annual":
        timesteps = self.yList(self.start_year, self.end_year)
    elif self.frequency == "original":
        # no temporal aggregation; optionally clip to the requested years
        if self.start_year is not None and self.end_year is not None:
            start_year_format = datetime(self.start_year, 1, 1).strftime("%Y-%m-%d")
            end_year_format = datetime(self.end_year, 12, 31).strftime("%Y-%m-%d")
            self.ee_dataset = self.ee_dataset.filterDate(
                start_year_format, end_year_format
            )
    # Flatten the time dimension into bands of a single image
    if self.frequency == "original":
        if type(self.ee_dataset) is ee.image.Image:
            byTimesteps = self.ee_dataset
        elif type(self.ee_dataset) is ee.imagecollection.ImageCollection:
            byTimesteps = self.ee_dataset.toBands()
    else:
        byTimesteps = self.temporalStack(
            timesteps, self.frequency, self.temporal_stat
        )
        byTimesteps = byTimesteps.toBands()

    # pre-processing: masking, thresholding, unit scaling
    if self.mask is not None:
        if self.mask == "water":
            byTimesteps = self.applyWaterMask(byTimesteps)
        elif type(self.mask) is ee.image.Image:
            byTimesteps = self.applyMask(byTimesteps, self.mask)
    if self.min_threshold is not None:
        byTimesteps = self.applyMinThreshold(byTimesteps, self.min_threshold)
    if self.scale_factor is not None:
        byTimesteps = self.applyScaleFactor(byTimesteps, self.scale_factor)

    allowed_statistics = {
        "count": ee.Reducer.frequencyHistogram().unweighted(),
        "mean": ee.Reducer.mean(),
        "max": ee.Reducer.max(),
        "median": ee.Reducer.median(),
        "min": ee.Reducer.min(),
        "sum": ee.Reducer.sum(),
        "stddev": ee.Reducer.stdDev(),
        "var": ee.Reducer.variance(),
        "minmax": ee.Reducer.minMax(),
        "p75": ee.Reducer.percentile(
            [75]
        ),  # maxBuckets=10 , minBucketWidth=1, maxRaw=1000
        "p25": ee.Reducer.percentile(
            [25]
        ),  # maxBuckets=10 , minBucketWidth=1, maxRaw=1000
        "p95": ee.Reducer.percentile(
            [95]
        ),  # maxBuckets=10 , minBucketWidth=1, maxRaw=1000
        "all": ee.Reducer.mean()
        .combine(ee.Reducer.minMax(), sharedInputs=True)
        .combine(ee.Reducer.stdDev(), sharedInputs=True),
    }

    def combine_reducers(reducer_list):
        # chain reducers so a single pass computes every requested statistic
        reducer = reducer_list[0]
        for r in reducer_list[1:]:
            reducer = reducer.combine(r, sharedInputs=True)
        return reducer

    if type(self.statistic_type) is str:
        if self.statistic_type not in allowed_statistics.keys():
            raise Exception(
                "statistic must be one of {}".format(
                    ", ".join(list(allowed_statistics.keys()))
                )
            )
        else:
            reducer = allowed_statistics[self.statistic_type]
    elif type(self.statistic_type) is list:
        for stat_type in self.statistic_type:
            if stat_type not in allowed_statistics.keys():
                raise Exception(
                    "statistic must be one of {}".format(
                        ", ".join(list(allowed_statistics.keys()))
                    )
                )
        reducer_list = [
            allowed_statistics[stat_type] for stat_type in self.statistic_type
        ]
        reducer = combine_reducers(reducer_list)

    if self.mapped:
        # reduceRegion per feature, mapped server-side over the collection

        def zs_func(feature):
            zs_result = byTimesteps.reduceRegion(
                reducer=reducer,
                geometry=feature.geometry(),
                scale=self.scale,
                maxPixels=10e15,  # 1e13
                tileScale=self.tile_scale,
            )
            feature = feature.set(zs_result)
            return feature

        zs = self.target_features.map(zs_func)  # .getInfo()
    else:
        zs = ee.Image(byTimesteps).reduceRegions(
            collection=self.target_features,
            reducer=reducer,
            scale=self.scale,
            tileScale=self.tile_scale,
        )
    if self.output_dir is not None and self.output_name is not None:
        # export asynchronously to Google Drive; monitor with reportRunTime()
        self.task = ee.batch.Export.table.toDrive(
            collection=zs,
            description=f"Zonal statistics for {self.collection_suffix}",
            fileFormat="CSV",
            folder=self.output_dir,
            fileNamePrefix=self.output_name,
        )
        self.task.start()
        # return(self)
    else:
        res = zs.getInfo()
        return self.get_zonal_res(res)

Retrieving output table

  1. Retrieve output table directly

Statistics can be accessed as the result of ZonalStats.runZonalStats(). This will be computed within the python earth engine environment.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
from gee_zonal import ZonalStats
AOIs = ee.FeatureCollection('users/afche18/Ethiopia_AOI') # ID of ee.FeatureCollection
zs = ZonalStats(
    collection_id = 'LANDSAT/LC08/C01/T1_8DAY_NDVI',
    target_features = AOIs,
    statistic_type = "all", # all includes min, max, mean, and stddev
    frequency = "annual",
    temporal_stat = "mean"
)
df = zs.runZonalStats()
df
  1. Submit an EE Task

Alternatively, a task can be submitted to the Earth Engine servers by specifying an output_name and output_dir.

This option is recommended to run statistics for big areas or for a high number of collections. The output table will be saved on the specified directory in Google Drive.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
import ee
from gee_zonal import ZonalStats
zs = ZonalStats(
    collection_id='UCSB-CHG/CHIRPS/PENTAD',
    target_features=AOIs,
    statistic_type="mean",
    temporal_stat="sum",
    frequency="annual",
    scale=5000,
    output_dir = "gdrive_folder",
    output_name="pretty_output"
)
zs.runZonalStats()

The status of the task can be monitored with ZonalStats.reportRunTime()

1
2
3
zs.reportRunTime()
>>> Completed
>>> Runtime: 1 minutes and 31 seconds

Searching the EE catalog

The [Earth Engine Data Catalog](https://developers.google.com/earth-engine/datasets) is an archive of public datasets available via Google Earth Engine. The Catalog() class provides a quick way to search for datasets by tags, title, and year / time period.

Initialize Catalog Object

The catalog object contains a datasets variable, a DataFrame containing a copy of the Earth Engine data catalog.

1
2
3
from gee_zonal import Catalog
cat = Catalog()
cat.datasets

Search functions

1
2
3
4
results = cat.search_tags("ndvi")
results = results.search_by_period(1985, 2021)
results = results.search_title("landsat")
print(results)

gee_zonal.catalog.Catalog

Bases: object

Inventory of Earth Engine public datasets, saved as a DataFrame under the datasets variable

Source code in gee_zonal/catalog.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class Catalog(object):
    """
    Inventory of Earth Engine public datasets, saved as a DataFrame under the
    datasets variable
    """

    def __init__(self, datasets=None, redownload=False):
        def load_datasets():
            # Upstream community-maintained listing of the EE public catalog
            url = "https://raw.githubusercontent.com/samapriya/Earth-Engine-Datasets-List/master/gee_catalog.csv"
            local_copy = os.path.join(
                repo_dir, "Earth-Engine-Datasets-List/eed_latest.csv"
            )
            if redownload:
                # refresh the bundled copy from the upstream listing
                df = pd.read_csv(url)
                df = df[
                    [
                        "id",
                        "provider",
                        "title",
                        "start_date",
                        "end_date",
                        "startyear",
                        "endyear",
                        "type",
                        "tags",
                        "asset_url",
                        "thumbnail_url",
                    ]
                ]
                df.to_csv(local_copy, index=False)
            else:
                try:
                    df = pd.read_csv(url)
                except Exception:
                    # offline fallback: use the copy shipped with the repo
                    df = pd.read_csv(local_copy)
            # normalize tags into lowercase lists and parse the date columns
            df["tags"] = df.tags.apply(lambda t: t.lower().split(", "))
            df["start_date"] = pd.to_datetime(df.start_date)
            df["end_date"] = pd.to_datetime(df.end_date)
            return df

        self.datasets = datasets if datasets is not None else load_datasets()

    def __str__(self):
        return self.datasets.title.to_string()

    def __len__(self):
        return len(self.datasets)

    def search_tags(self, keyword):
        """
        search for keyword in tags
        """
        needle = keyword.lower()
        hits = self.datasets[self.datasets.tags.apply(lambda tags: needle in tags)]
        if hits.empty:
            raise Exception("No hits!")
        return Catalog(hits)

    def search_title(self, keyword):
        """
        search for keyword in title
        """
        matches = self.datasets.title.apply(
            lambda title: bool(re.search(keyword, title, flags=re.IGNORECASE))
        )
        hits = self.datasets[matches]
        if hits.empty:
            raise Exception("No hits!")
        return Catalog(hits)

    def search_by_year(self, year):
        """
        get all datasets from a particular year:
            dataset start <= year <= dataset end
        """
        # a single year is just a degenerate period
        return self.search_by_period(year, year)

    def search_by_period(self, start, end):
        """
        get all datasets that intersect a time period:
            start of dataset <= end year
            end of dataset >= start year
        """
        overlaps = (self.datasets.startyear <= end) & (self.datasets.endyear >= start)
        hits = self.datasets[overlaps]
        if hits.empty:
            raise Exception("No hits!")
        return Catalog(hits)