# Copyright (c) 2020, TU Wien, Department of Geodesy and Geoinformation
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of TU Wien, Department of Geodesy and Geoinformation
# nor the names of its contributors may be used to endorse or promote
# products derived from this software without specific prior written
# permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL TU WIEN DEPARTMENT OF GEODESY AND
# GEOINFORMATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import logging
from datetime import datetime

import numpy as np

import pynetcf.time_series as nc
import pygeogrids.grids as grids
import pygeogrids.netcdf as grid2nc
import repurpose.resample as resamp


class Img2TsError(Exception):
    pass


class Img2Ts(object):
    """
    Class that uses the ``read`` method of the input dataset to read all
    images between startdate and enddate and saves them as netCDF time
    series files, following the cell structure of the target grid.

    Parameters
    ----------
    input_dataset : DatasetImgBase like class instance
        must implement a ``read(date, **input_kwargs)`` method that returns
        a `pygeobase.object_base.Image`, and a
        ``tstamps_for_daterange(startdate, enddate)`` method that returns
        the image timestamps in this period.
    outputpath : string
        path where the time series are saved
    startdate : date
        date from which the time series should start; images have to be
        available from this date onwards
    enddate : date
        date on which the time series should end; images have to be
        available up until this date
    input_kwargs : dict, optional
        keyword arguments to pass to the ``read`` method of the
        input dataset
    input_grid : grid instance as defined in :mod:`pygeogrids.grids`, optional
        the grid on which the input data is stored.
        If not given, the grid of the input dataset is used. If the input
        dataset has no grid object then resampling to the target_grid is
        performed.
    target_grid : grid instance as defined in :mod:`pygeogrids.grids`, optional
        the grid on which the time series will be stored.
        If not given, the grid of the input dataset is used.
    imgbuffer : int, optional
        number of images to read into memory before a time series is
        written. Choose it according to the memory of your machine and
        the data volume per image of the input dataset.
    variable_rename : dict, optional
        mapping from the variable names returned by the input dataset to
        the names to use in the time series files, e.g.
        ``{'sm': 'soil_moisture'}``. If not given, the names are kept.
unlim_chunksize : int, optional
netCDF chunksize for unlimited variables.
    cellsize_lat : float, optional
        if target_grid or the input grid are not cell grids, the cell size
        in latitude direction can be specified here.
        Default is 180.0, i.e. one global cell.
    cellsize_lon : float, optional
        if target_grid or the input grid are not cell grids, the cell size
        in longitude direction can be specified here.
        Default is 360.0, i.e. one global cell.
    r_methods : string or dict, optional
        resample method to use if resampling is necessary: either 'nn' for
        nearest neighbour or 'custom' for a custom weight function. Can
        also be a dictionary specifying the method for each variable.
    r_weightf : function or dict, optional
        if r_methods is 'custom' this function is used to calculate the
        weights depending on distance. Can also be a dictionary with a
        separate weight function for each variable.
r_min_n : int, optional
Minimum number of neighbours on the target_grid that are required for a point to be resampled.
r_radius : float, optional
resample radius in which neighbours should be searched given in meters
r_neigh : int, optional
maximum number of neighbours found inside r_radius to use during resampling. If more are found
the r_neigh closest neighbours will be used.
    r_fill_values : number or dict, optional
        if given, the resampled output array is filled with this value
        where no valid resampled value could be computed; otherwise a
        masked array is returned. Can also be a dictionary with a fill
        value for each variable.
filename_templ : string, optional
filename template must be a string with a string formatter for the cell number.
e.g. '%04d.nc' will translate to the filename '0001.nc' for cell number 1.
gridname : string, optional
filename of the grid which will be saved as netCDF
global_attr : dict, optional
global attributes for each file
    ts_attributes : dict, optional
        dictionary of attributes to set for the netCDF time series.
        Can either be a dictionary of attributes that is applied to all
        variables in input_data, or a dictionary of dictionaries: in that
        case there has to be one key per variable returned by input_data,
        and each inner dictionary holds the attributes for that variable's
        time series.
    ts_dtypes : numpy.dtype or dict of numpy.dtypes, optional
        data type to use for the time series. If it is a dictionary, a key
        must exist for each variable returned by input_data.
        Default : None, i.e. no change from the input data
    time_units : string, optional
        units the time axis is given in.
        Default: "days since 1858-11-17 00:00:00", which is modified
        julian date. For regular images this can be set freely since the
        conversion is done automatically; for images with irregular
        timestamps it is ignored for now.
    zlib : boolean, optional
        if True the saved netCDF files will be compressed.
        Default: True
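
    Examples
    --------
    A minimal usage sketch. ``MyImageDataset`` is a hypothetical reader
    implementing the ``read`` and ``tstamps_for_daterange`` methods
    described above::

        from datetime import datetime

        input_dataset = MyImageDataset()  # hypothetical image reader
        img2ts = Img2Ts(input_dataset,
                        outputpath='/tmp/ts_out',
                        startdate=datetime(2020, 1, 1),
                        enddate=datetime(2020, 1, 31),
                        imgbuffer=30)
        # reads images in bulks of ``imgbuffer`` and writes one netCDF
        # time series file per cell of the target grid
        img2ts.calc()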
"""

    def __init__(self, input_dataset, outputpath, startdate, enddate,
                 input_kwargs=None, input_grid=None, target_grid=None,
                 imgbuffer=100, variable_rename=None, unlim_chunksize=100,
                 cellsize_lat=180.0, cellsize_lon=360.0,
                 r_methods='nn', r_weightf=None, r_min_n=1, r_radius=18000,
                 r_neigh=8, r_fill_values=None, filename_templ='%04d.nc',
                 gridname='grid.nc', global_attr=None, ts_attributes=None,
                 ts_dtypes=None,
                 time_units="days since 1858-11-17 00:00:00", zlib=True):
self.imgin = input_dataset
self.zlib = zlib
        # an explicitly given input_grid takes precedence over the grid
        # attached to the input dataset
        if input_grid is not None:
            self.input_grid = input_grid
        elif hasattr(self.imgin, 'grid'):
            self.input_grid = self.imgin.grid
        else:
            self.input_grid = None
        if self.input_grid is None and target_grid is None:
            raise ValueError("Either the input dataset has to have a grid, "
                             "input_grid has to be specified or "
                             "target_grid has to be set")
        self.input_kwargs = input_kwargs if input_kwargs is not None else {}
        self.target_grid = target_grid
        if self.target_grid is None:
            self.target_grid = self.input_grid
            self.resample = False
        else:
            # if input and target grid are not equal, resampling is required
            self.resample = self.input_grid != self.target_grid
# if the target grid is not a cell grid make it one
# default is just one cell for the entire grid
if not isinstance(self.target_grid, grids.CellGrid):
self.target_grid = self.target_grid.to_cell_grid(cellsize_lat=cellsize_lat,
cellsize_lon=cellsize_lon)
self.currentdate = startdate
self.startdate = startdate
self.enddate = enddate
self.imgbuffer = imgbuffer
self.outputpath = outputpath
self.variable_rename = variable_rename
self.unlim_chunksize = unlim_chunksize
self.gridname = gridname
self.r_methods = r_methods
self.r_weightf = r_weightf
self.r_min_n = r_min_n
self.r_radius = r_radius
self.r_neigh = r_neigh
self.r_fill_values = r_fill_values
# if each image has only one timestamp then we are handling
# time series of type Orthogonal multidimensional array representation
# according to the CF conventions
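        # with individual timestamps per observation the series are instead
        # written as Indexed ragged array representation (IndexedRaggedTs)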
self.orthogonal = None
self.filename_templ = filename_templ
self.global_attr = global_attr
self.ts_attributes = ts_attributes
self.ts_dtypes = ts_dtypes
self.time_units = time_units
self.non_ortho_time_units = "days since 1858-11-17 00:00:00"

    def calc(self):
        """
        Go through all images and retrieve a stack of them, then go
        through all grid points in cell order and write the time series
        to netCDF files.
        """
# save grid information in file
grid2nc.save_grid(
os.path.join(self.outputpath, self.gridname), self.target_grid)
for img_stack_dict, start, end, dates, jd_stack in self.img_bulk():
start_time = datetime.now()
for cell in self.target_grid.get_cells():
cell_gpis, cell_lons, cell_lats = self.target_grid.grid_points_for_cell(
cell)
# look where in the subset the data is
cell_index = np.where(
cell == self.target_grid.activearrcell)[0]
if cell_index.size == 0:
raise Img2TsError('cell not found in grid subset')
data = {}
for key in img_stack_dict:
# rename variable in output dataset
if self.variable_rename is None:
var_new_name = str(key)
else:
var_new_name = self.variable_rename[key]
output_array = np.swapaxes(
img_stack_dict[key][:, cell_index], 0, 1)
                    # change the dtypes of the output time series
                    if self.ts_dtypes is not None:
                        if isinstance(self.ts_dtypes, dict):
                            output_dtype = self.ts_dtypes[key]
                        else:
                            output_dtype = self.ts_dtypes
                        output_array = output_array.astype(output_dtype)
data[var_new_name] = output_array
if self.orthogonal:
with nc.OrthoMultiTs(
os.path.join(self.outputpath,
self.filename_templ % cell),
n_loc=cell_gpis.size, mode='a',
zlib=self.zlib,
unlim_chunksize=self.unlim_chunksize,
time_units=self.time_units) as dataout:
# add global attributes to file
if self.global_attr is not None:
for attr in self.global_attr:
dataout.add_global_attr(
attr, self.global_attr[attr])
dataout.add_global_attr(
'geospatial_lat_min', np.min(cell_lats))
dataout.add_global_attr(
'geospatial_lat_max', np.max(cell_lats))
dataout.add_global_attr(
'geospatial_lon_min', np.min(cell_lons))
dataout.add_global_attr(
'geospatial_lon_max', np.max(cell_lons))
dataout.write_all(cell_gpis, data, dates,
lons=cell_lons,
lats=cell_lats,
attributes=self.ts_attributes)
                else:
with nc.IndexedRaggedTs(os.path.join(self.outputpath,
self.filename_templ %
cell),
n_loc=cell_gpis.size, mode='a',
zlib=self.zlib,
unlim_chunksize=self.unlim_chunksize,
time_units=self.non_ortho_time_units) as dataout:
# add global attributes to file
if self.global_attr is not None:
for attr in self.global_attr:
dataout.add_global_attr(
attr, self.global_attr[attr])
dataout.add_global_attr(
'geospatial_lat_min', np.min(cell_lats))
dataout.add_global_attr(
'geospatial_lat_max', np.max(cell_lats))
dataout.add_global_attr(
'geospatial_lon_min', np.min(cell_lons))
dataout.add_global_attr(
'geospatial_lon_max', np.max(cell_lons))
                        # for this representation we have to loop over the
                        # gpis since each time series can have a different
                        # length
for i, (gpi, gpi_lon, gpi_lat) in enumerate(zip(cell_gpis, cell_lons, cell_lats)):
gpi_data = {}
                            # convert julian date to modified julian date
                            # (MJD = JD - 2400000.5)
gpi_jd = jd_stack[:, cell_index[i]] - 2400000.5
                            # remove measurements that were filled with the
                            # fill value during resampling; checking the time
                            # variable is enough since without a timestamp
                            # there are no valid observations
if self.resample:
                                if self.r_fill_values is not None:
                                    if isinstance(self.r_fill_values, dict):
                                        time_fill_value = self.r_fill_values[
                                            self.time_var]
                                    else:
                                        time_fill_value = self.r_fill_values
valid_mask = gpi_jd != time_fill_value
else:
valid_mask = np.invert(gpi_jd.mask)
gpi_jd = gpi_jd[valid_mask]
else:
# all are valid if no resampling took place
valid_mask = slice(None, None, None)
for key in data:
gpi_data[key] = data[key][i, valid_mask]
if gpi_jd.data.size > 0:
dataout.write_ts(gpi, gpi_data, gpi_jd,
lon=gpi_lon, lat=gpi_lat,
attributes=self.ts_attributes,
dates_direct=True)
data = {}
output_array = None
            logging.info("bulk processed in %s", datetime.now() - start_time)

    def img_bulk(self):
        """
        Read up to ``self.imgbuffer`` images into memory and yield them
        as a stack, together with their start and end date, until all
        dates between startdate and enddate have been read.

        Yields
        ------
        img_stack_dict : dict of numpy.array
            stack of images for each variable
        startdate : date
            date of the first image in the stack
        enddate : date
            date of the last image in the stack
        datetimestack : np.array
            array of the timestamps of each image
        jd_stack : np.array or None
            if None, all observations in an image have the same
            observation timestamp. Otherwise it gives the julian date
            of each observation in img_stack_dict.
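
        Examples
        --------
        Consuming the generator directly, e.g. to inspect the stacks
        (a sketch, with ``img2ts`` an instance as in the class Examples;
        normally :meth:`calc` does this internally)::

            for stack, start, end, dates, jd in img2ts.img_bulk():
                print(start, end, {k: v.shape for k, v in stack.items()})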
"""
img_dict = {}
datetimes = []
jd_list = []
# set start of current imgbulk to startdate
bulkstart = self.startdate
# image counter
read_images = 0
dates = self.imgin.tstamps_for_daterange(self.startdate,
self.enddate)
        for date in dates:
            # the currently processed date marks the end of the bulk;
            # the first date after a yield marks the start of a new bulk
            self.currentdate = date
            if bulkstart is None:
                bulkstart = date
try:
(input_img, metadata,
image_datetime, lon,
lat, time_arr) = self.imgin.read(date, **self.input_kwargs)
except IOError as e:
msg = "I/O error({0}): {1}".format(e.errno,
e.strerror)
logging.log(logging.INFO, msg)
continue
read_images += 1
logging.log(logging.INFO, "read" + image_datetime.isoformat())
if self.resample:
if time_arr is not None:
input_img['jd'] = time_arr
input_img = resamp.resample_to_grid(input_img, lon, lat,
self.target_grid.activearrlon,
self.target_grid.activearrlat,
methods=self.r_methods,
weight_funcs=self.r_weightf,
min_neighbours=self.r_min_n,
search_rad=self.r_radius,
neighbours=self.r_neigh,
fill_values=self.r_fill_values)
                # retrieve the (resampled) observation times; None if the
                # image has no individual timestamps
                time_arr = input_img.pop('jd', None)
if time_arr is None:
self.time_var = None
else:
self.time_var = 'jd'
            if time_arr is not None:
                # each observation of the image has its own observation
                # time, so the resulting time series is not regularly
                # spaced in time
                if self.orthogonal is None:
                    self.orthogonal = False
                if self.orthogonal:
                    raise Img2TsError(
                        "Images can not switch between a fixed image "
                        "timestamp and individual timestamps for each "
                        "observation")
                jd_list.append(time_arr)
            else:
                if self.orthogonal is None:
                    self.orthogonal = True
                if not self.orthogonal:
                    raise Img2TsError(
                        "Images can not switch between a fixed image "
                        "timestamp and individual timestamps for each "
                        "observation")
for key in input_img:
                if key not in img_dict:
img_dict[key] = []
img_dict[key].append(input_img[key])
datetimes.append(image_datetime)
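            # if the buffer is full, stack the collected images into one
            # array per variable and yield the bulk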
if read_images >= self.imgbuffer - 1:
img_stack_dict = {}
if len(jd_list) != 0:
jd_stack = np.ma.vstack(jd_list)
jd_list = None
else:
jd_stack = None
for key in img_dict:
img_stack_dict[key] = np.vstack(img_dict[key])
img_dict[key] = None
datetimestack = np.array(datetimes)
img_dict = {}
datetimes = []
jd_list = []
yield (img_stack_dict, bulkstart, self.currentdate,
datetimestack, jd_stack)
                # reset the image counter and start a new bulk
                read_images = 0
                bulkstart = None
if len(datetimes) > 0:
img_stack_dict = {}
if len(jd_list) != 0:
jd_stack = np.ma.vstack(jd_list)
else:
jd_stack = None
for key in img_dict:
img_stack_dict[key] = np.vstack(img_dict[key])
img_dict[key] = None
datetimestack = np.array(datetimes)
yield (img_stack_dict, bulkstart, self.currentdate, datetimestack,
jd_stack)
if __name__ == '__main__':
import rsdata.GLDAS_NOAH.interface as GLDAS
import rsdata.root_path as root
outputpath = os.path.join(root.d, 'img2ts_test')
start = datetime(2014, 2, 5)
end = datetime(2014, 4, 21)
img2ts = Img2Ts(GLDAS.GLDAS025Img(parameter=['086_L1', '086_L2',
'086_L3', '086_L4', '131',
'132', '138', '085_L1',
'065', '057']),
outputpath, start, end, imgbuffer=15)
img2ts.calc()