Source code for repurpose.img2ts

# Copyright (c) 2020, TU Wien, Department of Geodesy and Geoinformation
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#    * Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of TU Wien, Department of Geodesy and Geoinformation
#      nor the names of its contributors may be used to endorse or promote
#      products derived from this software without specific prior written
#      permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL TU WIEN DEPARTMENT OF GEODESY AND
# GEOINFORMATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


import pynetcf.time_series as nc
import pygeogrids.grids as grids
import repurpose.resample as resamp
import numpy as np
import os
from datetime import datetime
import logging
import pygeogrids
import pygeogrids.netcdf as grid2nc


[docs]class Img2TsError(Exception): pass
[docs]class Img2Ts(object): """ class that uses the read_img iterator of the input_data dataset to read all images between startdate and enddate and saves them in netCDF time series files according to the given netCDF class and the cell structure of the outputgrid Parameters ---------- input_dataset : DatasetImgBase like class instance must implement a ``read(date, **input_kwargs)`` iterator that returns a `pygeobase.object_base.Image`. outputpath : string path where to save the time series to startdate : date date from which the time series should start. Of course images have to be available from this date onwards. enddate : date date when the time series should end. Images should be availabe up until this date input_kwargs : dict, optional keyword arguments which should be used in the read_img method of the input_dataset input_grid : grid instance as defined in :module:`pytesmo.grids.grid`, optional the grid on which input data is stored. If not given then the grid of the input dataset will be used. If the input dataset has no grid object then resampling to the target_grid is performed. target_grid : grid instance as defined in :module:`pytesmo.grids.grid`, optional the grid on which the time series will be stored. If not given then the grid of the input dataset will be used imgbuffer : int, optional number of days worth of images that should be read into memory before a time series is written. This parameter should be chosen so that the memory of your machine is utilized. It depends on the daily data volume of the input dataset variable_rename : dict, optional if the variables should have other names than the names that are returned as keys in the dict by the daily_images iterator. A dictionary can be provided that changes these names for the time series. unlim_chunksize : int, optional netCDF chunksize for unlimited variables. cellsize_lat : float, optional if outgrid or input_data.grid are not cell grids then the cellsize in latitude direction can be specified here. Default is 1 global cell. cellsize_lon : float, optional if outgrid or input_data.grid are not cell grids then the cellsize in longitude direction can be specified here. Default is 1 global cell. r_methods : string or dict, optional resample methods to use if resampling is necessary, either 'nn' for nearest neighbour or 'custom' for custom weight function. Can also be a dictionary in which the method is specified for each variable r_weightf : function or dict, optional if r_methods is custom this function will be used to calculate the weights depending on distance. This can also be a dict with a separate weight function for each variable. r_min_n : int, optional Minimum number of neighbours on the target_grid that are required for a point to be resampled. r_radius : float, optional resample radius in which neighbours should be searched given in meters r_neigh : int, optional maximum number of neighbours found inside r_radius to use during resampling. If more are found the r_neigh closest neighbours will be used. r_fill_values : number or dict, optional if given the resampled output array will be filled with this value if no valid resampled value could be computed, if not a masked array will be returned can also be a dict with a fill value for each variable filename_templ : string, optional filename template must be a string with a string formatter for the cell number. e.g. '%04d.nc' will translate to the filename '0001.nc' for cell number 1. gridname : string, optional filename of the grid which will be saved as netCDF global_attr : dict, optional global attributes for each file ts_attributes : dict, optional dictionary of attributes that should be set for the netCDF time series. Can be either a dictionary of attributes that will be set for all variables in input_data or a dictionary of dictionaries. In the second case the first dictionary has to have a key for each variable returned by input_data and the second level dictionary will be the dictionary of attributes for this time series. ts_dtype : numpy.dtype or dict of numpy.dtypes data type to use for the time series, if it is a dict then a key must exist for each variable returned by input_data. Default : None, no change from input data time_units : string, optional units the time axis is given in. Default: "days since 1858-11-17 00:00:00" which is modified julian date for regular images this can be set freely since the conversion is done automatically, for images with irregular timestamp this will be ignored for now zlib: boolean, optional if True the saved netCDF files will be compressed Default: True """ def __init__(self, input_dataset, outputpath, startdate, enddate, input_kwargs={}, input_grid=None, target_grid=None, imgbuffer=100, variable_rename=None, unlim_chunksize=100, cellsize_lat=180.0, cellsize_lon=360.0, r_methods='nn', r_weightf=None, r_min_n=1, r_radius=18000, r_neigh=8, r_fill_values=None, filename_templ='%04d.nc', gridname='grid.nc', global_attr=None, ts_attributes=None, ts_dtypes=None, time_units="days since 1858-11-17 00:00:00", zlib=True): self.imgin = input_dataset self.zlib = zlib if not hasattr(self.imgin, 'grid'): self.input_grid = input_grid else: self.input_grid = self.imgin.grid if self.input_grid is None and target_grid is None: raise ValueError("Either the input dataset has to have a grid, " "input_grid has to be specified or " "target_grid has to be set") self.input_kwargs = input_kwargs self.target_grid = target_grid if self.target_grid is None: self.target_grid = self.input_grid self.resample = False else: # if input and target grid are not equal resampling is required if self.input_grid != self.target_grid: self.resample = True # if the target grid is not a cell grid make it one # default is just one cell for the entire grid if not isinstance(self.target_grid, grids.CellGrid): self.target_grid = self.target_grid.to_cell_grid(cellsize_lat=cellsize_lat, cellsize_lon=cellsize_lon) self.currentdate = startdate self.startdate = startdate self.enddate = enddate self.imgbuffer = imgbuffer self.outputpath = outputpath self.variable_rename = variable_rename self.unlim_chunksize = unlim_chunksize self.gridname = gridname self.r_methods = r_methods self.r_weightf = r_weightf self.r_min_n = r_min_n self.r_radius = r_radius self.r_neigh = r_neigh self.r_fill_values = r_fill_values # if each image has only one timestamp then we are handling # time series of type Orthogonal multidimensional array representation # according to the CF conventions self.orthogonal = None self.filename_templ = filename_templ self.global_attr = global_attr self.ts_attributes = ts_attributes self.ts_dtypes = ts_dtypes self.time_units = time_units self.non_ortho_time_units = "days since 1858-11-17 00:00:00"
[docs] def calc(self): """ go through all images and retrieve a stack of them then go through all grid points in cell order and write to netCDF file """ # save grid information in file grid2nc.save_grid( os.path.join(self.outputpath, self.gridname), self.target_grid) for img_stack_dict, start, end, dates, jd_stack in self.img_bulk(): #================================================================== start_time = datetime.now() for cell in self.target_grid.get_cells(): cell_gpis, cell_lons, cell_lats = self.target_grid.grid_points_for_cell( cell) # look where in the subset the data is cell_index = np.where( cell == self.target_grid.activearrcell)[0] if cell_index.size == 0: raise Img2TsError('cell not found in grid subset') data = {} for key in img_stack_dict: # rename variable in output dataset if self.variable_rename is None: var_new_name = str(key) else: var_new_name = self.variable_rename[key] output_array = np.swapaxes( img_stack_dict[key][:, cell_index], 0, 1) # change dtypes of output time series if self.ts_dtypes is not None: if type(self.ts_dtypes) == dict: output_dtype = self.ts_dtypes[key] else: output_dtype = self.ts_dtypes output_array = output_array.astype(output_dtype) data[var_new_name] = output_array if self.orthogonal: with nc.OrthoMultiTs( os.path.join(self.outputpath, self.filename_templ % cell), n_loc=cell_gpis.size, mode='a', zlib=self.zlib, unlim_chunksize=self.unlim_chunksize, time_units=self.time_units) as dataout: # add global attributes to file if self.global_attr is not None: for attr in self.global_attr: dataout.add_global_attr( attr, self.global_attr[attr]) dataout.add_global_attr( 'geospatial_lat_min', np.min(cell_lats)) dataout.add_global_attr( 'geospatial_lat_max', np.max(cell_lats)) dataout.add_global_attr( 'geospatial_lon_min', np.min(cell_lons)) dataout.add_global_attr( 'geospatial_lon_max', np.max(cell_lons)) dataout.write_all(cell_gpis, data, dates, lons=cell_lons, lats=cell_lats, attributes=self.ts_attributes) elif not self.orthogonal: with nc.IndexedRaggedTs(os.path.join(self.outputpath, self.filename_templ % cell), n_loc=cell_gpis.size, mode='a', zlib=self.zlib, unlim_chunksize=self.unlim_chunksize, time_units=self.non_ortho_time_units) as dataout: # add global attributes to file if self.global_attr is not None: for attr in self.global_attr: dataout.add_global_attr( attr, self.global_attr[attr]) dataout.add_global_attr( 'geospatial_lat_min', np.min(cell_lats)) dataout.add_global_attr( 'geospatial_lat_max', np.max(cell_lats)) dataout.add_global_attr( 'geospatial_lon_min', np.min(cell_lons)) dataout.add_global_attr( 'geospatial_lon_max', np.max(cell_lons)) # for this dataset we have to loop through the gpis since each time series # can be different in length for i, (gpi, gpi_lon, gpi_lat) in enumerate(zip(cell_gpis, cell_lons, cell_lats)): gpi_data = {} # convert to modified julian date gpi_jd = jd_stack[:, cell_index[i]] - 2400000.5 # remove measurements that were filled with the fill value # during resampling # doing this on the basis of the time variable should # be enought since without time -> no valid # observations if self.resample: if self.r_fill_values is not None: if type(self.r_fill_values) == dict: time_fill_value = self.r_fill_values[ self.time_var] else: time_fill_value = self.r_fill_values valid_mask = gpi_jd != time_fill_value else: valid_mask = np.invert(gpi_jd.mask) gpi_jd = gpi_jd[valid_mask] else: # all are valid if no resampling took place valid_mask = slice(None, None, None) for key in data: gpi_data[key] = data[key][i, valid_mask] if gpi_jd.data.size > 0: dataout.write_ts(gpi, gpi_data, gpi_jd, lon=gpi_lon, lat=gpi_lat, attributes=self.ts_attributes, dates_direct=True) data = {} output_array = None logging.log(logging.INFO, datetime.now() - start_time)
[docs] def img_bulk(self): """ Yields numpy array of self.const.imgbuffer images, start and enddate until all dates have been read Returns ------- img_stack_dict : dict of numpy.array stack of daily images for each variable startdate : date date of first image in stack enddate : date date of last image in stack datetimestack : np.array array of the timestamps of each image jd_stack : np.array or None if None all observations in an image have the same observation timestamp. Otherwise it gives the julian date of each observation in img_stack_dict """ img_dict = {} datetimes = [] jd_list = [] # set start of current imgbulk to startdate bulkstart = self.startdate # image counter read_images = 0 dates = self.imgin.tstamps_for_daterange(self.startdate, self.enddate) for date in dates: try: (input_img, metadata, image_datetime, lon, lat, time_arr) = self.imgin.read(date, **self.input_kwargs) except IOError as e: msg = "I/O error({0}): {1}".format(e.errno, e.strerror) logging.log(logging.INFO, msg) continue read_images += 1 logging.log(logging.INFO, "read" + image_datetime.isoformat()) if self.resample: if time_arr is not None: input_img['jd'] = time_arr input_img = resamp.resample_to_grid(input_img, lon, lat, self.target_grid.activearrlon, self.target_grid.activearrlat, methods=self.r_methods, weight_funcs=self.r_weightf, min_neighbours=self.r_min_n, search_rad=self.r_radius, neighbours=self.r_neigh, fill_values=self.r_fill_values) time_arr = input_img.pop('jd') if time_arr is None: self.time_var = None else: self.time_var = 'jd' if time_arr is not None: # if time_var is not None means that each observation of the # image has its own observation time # this means that the resulting time series is not # regularly spaced in time if self.orthogonal is None: self.orthogonal = False if self.orthogonal: raise Img2TsError("Images can not switch between a fixed image " "timestamp and individual timestamps for each observation") jd_list.append(time_arr) if time_arr is None: if self.orthogonal is None: self.orthogonal = True if not self.orthogonal: raise Img2TsError( "Images can not switch between a fixed image " "timestamp and individual timestamps for each observation") for key in input_img: if key not in img_dict.keys(): img_dict[key] = [] img_dict[key].append(input_img[key]) datetimes.append(image_datetime) if read_images >= self.imgbuffer - 1: img_stack_dict = {} if len(jd_list) != 0: jd_stack = np.ma.vstack(jd_list) jd_list = None else: jd_stack = None for key in img_dict: img_stack_dict[key] = np.vstack(img_dict[key]) img_dict[key] = None datetimestack = np.array(datetimes) img_dict = {} datetimes = [] jd_list = [] yield (img_stack_dict, bulkstart, self.currentdate, datetimestack, jd_stack) # reset image counter read_images = 0 if len(datetimes) > 0: img_stack_dict = {} if len(jd_list) != 0: jd_stack = np.ma.vstack(jd_list) else: jd_stack = None for key in img_dict: img_stack_dict[key] = np.vstack(img_dict[key]) img_dict[key] = None datetimestack = np.array(datetimes) yield (img_stack_dict, bulkstart, self.currentdate, datetimestack, jd_stack)
if __name__ == '__main__': import rsdata.GLDAS_NOAH.interface as GLDAS import rsdata.root_path as root # outputpath = os.path.join(root.d, 'GIO GL', 'Evolution', 'SWI_NRT_validation', 'GLDAS', # 'raw') outputpath = os.path.join(root.d, 'img2ts_test') start = datetime(2014, 2, 5) end = datetime(2014, 4, 21) img2ts = Img2Ts(GLDAS.GLDAS025Img(parameter=['086_L1', '086_L2', '086_L3', '086_L4', '131', '132', '138', '085_L1', '065', '057']), outputpath, start, end, imgbuffer=15) img2ts.calc()