We're Moving!

The Vertica Forum is moving to a new OpenText Analytics Database (Vertica) Community.

Join us there to post discussion topics, learn about

product releases, share tips, access the blog, and much more.

Create My New Community Account Now


Problem with two R UDFs — Vertica Forum

Problem with two R UDFs

Next two R UDFs do not work. I can send You .R files and data files used. My e-mail is Juri.Kuusik@energia.ee for both I receive next error message: select R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation(DateTime,Value) OVER() from DM.Demo_DataQuality1: imageimage and for select R_UDF_Time_Series_Outliers_Imputation(DateTime,Value) OVER() from DM.Demo_DataQuality2 imageimage R code: # R UDF NR 1 # Imputation for all missing values and records (datetime is missing) in time series R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation <- function(DF) { # Loading libraries require(plyr,quietly=TRUE) require(xts,quietly=TRUE) # Parameters: TIME_DATE_FORMAT <- '%d/%m/%Y %H:%M:%S' ISDATE <- 'NO' Cross_Vars <- NULL Impute_method='na.spline' DateTime_Var <- 'DateTime' Value_Var <- 'Value' names(DF) <- c(DateTime_Var,Value_Var) # Function for imputating missing values and records for given data frame my.func <- function(df){ # Data with missing records # Start datetime Str.START.DATETIME <- df[1,DateTime_Var] # End datetime Str.END.DATETIME <- df[nrow(df),DateTime_Var] # Creating contigous sequence of datetimes as vector and actual time series if (ISDATE == "YES") { full.datetimes.xts <- seq(from = as.Date(Str.START.DATETIME, format = TIME_DATE_FORMAT), to = as.Date(Str.END.DATETIME, format = TIME_DATE_FORMAT), by=1 ) } else { full.datetimes.xts <- seq(from = as.POSIXct(strptime(Str.START.DATETIME, TIME_DATE_FORMAT)), to = as.POSIXct(strptime(Str.END.DATETIME, TIME_DATE_FORMAT)), by = 3600 ) } actual.data.xts <- xts(df[[Value_Var]], as.POSIXct(strptime(df[[DateTime_Var]],TIME_DATE_FORMAT)) ) # Creating contigous sequence of datetimes as 'xts' object full.datetimes.xts <- xts(,full.datetimes.xts) # Creating whole time series containing also initialy missing records which values will be marked as 'NA' all.data.xts <- merge.xts(actual.data.xts, full.datetimes.xts, all=TRUE ) # Imputing missing values if (Impute_method == 'na.spline') all.data.imputed <- as.data.frame(na.spline(all.data.xts)) all.data.imputed[DateTime_Var] <- row.names(all.data.imputed) all.data.imputed <- all.data.imputed[c(DateTime_Var,'actual.data.xts')] names(all.data.imputed) = names(df)[1:2] return(all.data.imputed) } # Reformating datetime DF[DateTime_Var] <- as.character(DF[[DateTime_Var]]) # Finding missing values and records if (is.null(Cross_Vars)) { # Sorting data frame DF <- DF[order(DF[[DateTime_Var]]),] # Create unique ID for non cross sectional data DF$ID <- 1:nrow(DF) # Finding missing values and records and creating output data frame DF.OUT <- my.func(DF) # Re-arrange columns DF.OUT <- DF.OUT[,c(DateTime_Var,Value_Var)] } else { # Create unique ID for cross sectional data by concatenating values of table key # Converting all variables in 'Cross_Vars' to character # Creating vector of all cross variables Cross_Vars_Vec <- unlist(strsplit(Cross_Vars,split="-")) # Converting all variables for (my.var in Cross_Vars_Vec) DF[my.var] <- as.character(DF[my.var]) # Creating ID variable and putting it into dataframe DF$ID <- as.vector(apply(DF[Cross_Vars_Vec],1,function(x) paste(x,sep='_'))) # Sorting data frame DF <- DF[order(DF[["ID"]],DF[[DateTime_Var]]),] # Finding missing values and records for each subset by ID and creating output data frame DF.OUT <- plyr::ddply(DF, .(ID), my.func) # Re-arrange columns # DF.OUT <- } return(DF.OUT) } R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation_Factory <- function() { list(name=R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation, udxtype=c("transform"), intype=c("varchar","numeric"), outtype=c("varchar","numeric"), outnames=c("DateTime","Value") ) } # R UDF NR 2 # Imputation of outliers in time series found using STL decomposition of time series /local regression + IQR R_UDF_Time_Series_Outliers_Imputation <- function(DF) { # Loading libraries require(xts,quietly=TRUE) # Parameters: TIME_DATE_FORMAT <- '%d/%m/%Y %H:%M:%S' ISDATE <- 'NO' Cross_Vars <- NULL Impute_method='na.spline' DateTime_Var <- 'DateTime' Value_Var <- 'Value' names(DF) <- c(DateTime_Var,Value_Var) PLOT <- TRUE FILENAME.JPG <- 'R_UDF_Time_Series_Outliers_Imputation_Example.jpg' WIDTH <- 1600 HEIGHT <- 900 x.xts <- xts(DF[[Value_Var]], as.POSIXct(strptime(DF[[DateTime_Var]],TIME_DATE_FORMAT)) ) x.ts <- as.ts(x.xts) if(frequency(x.ts)>1) resid <- stl(x.ts,s.window="periodic",robust=TRUE)$time.series[,3] else { tt <- 1:length(x.ts) resid <- residuals(loess(x.ts ~ tt)) } resid.q <- quantile(resid,prob=c(0.25,0.75)) iqr <- diff(resid.q) limits <- resid.q + 1.5*iqr*c(-1,1) score <- abs(pmin((resid-limits[1])/iqr,0) + pmax((resid - limits[2])/iqr,0)) vec.scores <- as.vector(score) my.seq <- 1:length(vec.scores) index.outliers <- my.seq[vec.scores > 0] # creating output data frame if (Impute_method == 'na.spline') if (any(vec.scores > 0)) { # Recoding all outliers into 'NA' x.xts.na <- x.xts x.xts.na[index.outliers] <- NA x.imp.xts <- na.spline(x.xts.na) all.data.imputed <- as.data.frame(x.imp.xts) } else all.data.imputed <- as.data.frame(x.xts) all.data.imputed[DateTime_Var] <- row.names(all.data.imputed) all.data.imputed <- all.data.imputed[c(DateTime_Var,'V1')] names(all.data.imputed) = c(DateTime_Var,Value_Var) if(PLOT) if (any(vec.scores > 0)){ jpeg(FILENAME.JPG, width=WIDTH, height=HEIGHT) layout(1:2) # plot before imputation plot(x.xts, main='Time series with outliers') # plot after imputation plot(x.imp.xts, main='Time series after smoothing of outliers') dev.off() } return(all.data.imputed) } R_UDF_Time_Series_Outliers_Imputation_Factory <- function() { list(name=R_UDF_Time_Series_Outliers_Imputation, udxtype=c("transform"), intype=c("varchar","numeric"), outtype=c("varchar","numeric"), outnames=c("DateTime","Value") ) }

Comments

  • R code (in the first post there is incorrect one): # R UDF NR 1 # Imputation for all missing values and records (datetime is missing) in time series R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation <- function(DF) { # Loading libraries require(plyr,quietly=TRUE) require(xts,quietly=TRUE) # Parameters: TIME_DATE_FORMAT <- '%d/%m/%Y %H:%M:%S' ISDATE <- 'NO' Cross_Vars <- NULL Impute_method='na.spline' DateTime_Var <- 'DateTime' Value_Var <- 'Value' names(DF) <- c(DateTime_Var,Value_Var) # Function for imputating missing values and records for given data frame my.func <- function(df){ # Data with missing records # Start datetime Str.START.DATETIME <- df[1,DateTime_Var] # End datetime Str.END.DATETIME <- df[nrow(df),DateTime_Var] # Creating contigous sequence of datetimes as vector and actual time series if (ISDATE == "YES") { full.datetimes.xts <- seq(from = as.Date(Str.START.DATETIME, format = TIME_DATE_FORMAT), to = as.Date(Str.END.DATETIME, format = TIME_DATE_FORMAT), by=1 ) } else { full.datetimes.xts <- seq(from = as.POSIXct(strptime(Str.START.DATETIME, TIME_DATE_FORMAT)), to = as.POSIXct(strptime(Str.END.DATETIME, TIME_DATE_FORMAT)), by = 3600 ) } actual.data.xts <- xts(df[[Value_Var]], as.POSIXct(strptime(df[[DateTime_Var]],TIME_DATE_FORMAT)) ) # Creating contigous sequence of datetimes as 'xts' object full.datetimes.xts <- xts(,full.datetimes.xts) # Creating whole time series containing also initialy missing records which values will be marked as 'NA' all.data.xts <- merge.xts(actual.data.xts, full.datetimes.xts, all=TRUE ) # Imputing missing values if (Impute_method == 'na.spline') all.data.imputed <- as.data.frame(na.spline(all.data.xts)) all.data.imputed[DateTime_Var] <- row.names(all.data.imputed) all.data.imputed <- all.data.imputed[c(DateTime_Var,'actual.data.xts')] names(all.data.imputed) = names(df)[1:2] return(all.data.imputed) } # Reformating datetime DF[DateTime_Var] <- as.character(DF[[DateTime_Var]]) # Finding missing values and records if (is.null(Cross_Vars)) { # Sorting data frame DF <- DF[order(DF[[DateTime_Var]]),] # Create unique ID for non cross sectional data DF$ID <- 1:nrow(DF) # Finding missing values and records and creating output data frame DF.OUT <- my.func(DF) # Re-arrange columns DF.OUT <- DF.OUT[,c(DateTime_Var,Value_Var)] } else { # Create unique ID for cross sectional data by concatenating values of table key # Converting all variables in 'Cross_Vars' to character # Creating vector of all cross variables Cross_Vars_Vec <- unlist(strsplit(Cross_Vars,split="-")) # Converting all variables for (my.var in Cross_Vars_Vec) DF[my.var] <- as.character(DF[my.var]) # Creating ID variable and putting it into dataframe DF$ID <- as.vector(apply(DF[Cross_Vars_Vec],1,function(x) paste(x,sep='_'))) # Sorting data frame DF <- DF[order(DF[["ID"]],DF[[DateTime_Var]]),] # Finding missing values and records for each subset by ID and creating output data frame DF.OUT <- plyr::ddply(DF, .(ID), my.func) # Re-arrange columns # DF.OUT <- } return(DF.OUT) } R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation_Factory <- function() { list(name=R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation, udxtype=c("transform"), intype=c("varchar","numeric"), outtype=c("varchar","numeric"), outnames=c("DateTime","Value") ) } # R UDF NR 2 # Imputation of outliers in time series found using STL decomposition of time series /local regression + IQR R_UDF_Time_Series_Outliers_Imputation <- function(DF) { # Loading libraries require(xts,quietly=TRUE) # Parameters: TIME_DATE_FORMAT <- '%d/%m/%Y %H:%M:%S' ISDATE <- 'NO' Cross_Vars <- NULL Impute_method='na.spline' DateTime_Var <- 'DateTime' Value_Var <- 'Value' names(DF) <- c(DateTime_Var,Value_Var) PLOT <- TRUE FILENAME.JPG <- 'R_UDF_Time_Series_Outliers_Imputation_Example.jpg' WIDTH <- 1600 HEIGHT <- 900 x.xts <- xts(DF[[Value_Var]], as.POSIXct(strptime(DF[[DateTime_Var]],TIME_DATE_FORMAT)) ) x.ts <- as.ts(x.xts) if(frequency(x.ts)>1) resid <- stl(x.ts,s.window="periodic",robust=TRUE)$time.series[,3] else { tt <- 1:length(x.ts) resid <- residuals(loess(x.ts ~ tt)) } resid.q <- quantile(resid,prob=c(0.25,0.75)) iqr <- diff(resid.q) limits <- resid.q + 1.5*iqr*c(-1,1) score <- abs(pmin((resid-limits[1])/iqr,0) + pmax((resid - limits[2])/iqr,0)) vec.scores <- as.vector(score) my.seq <- 1:length(vec.scores) index.outliers <- my.seq[vec.scores > 0] # creating output data frame if (Impute_method == 'na.spline') if (any(vec.scores > 0)) { # Recoding all outliers into 'NA' x.xts.na <- x.xts x.xts.na[index.outliers] <- NA x.imp.xts <- na.spline(x.xts.na) all.data.imputed <- as.data.frame(x.imp.xts) } else all.data.imputed <- as.data.frame(x.xts) all.data.imputed[DateTime_Var] <- row.names(all.data.imputed) all.data.imputed <- all.data.imputed[c(DateTime_Var,'V1')] names(all.data.imputed) = c(DateTime_Var,Value_Var) if(PLOT) if (any(vec.scores > 0)){ jpeg(FILENAME.JPG, width=WIDTH, height=HEIGHT) layout(1:2) # plot before imputation plot(x.xts, main='Time series with outliers') # plot after imputation plot(x.imp.xts, main='Time series after smoothing of outliers') dev.off() } return(all.data.imputed) } R_UDF_Time_Series_Outliers_Imputation_Factory <- function() { list(name=R_UDF_Time_Series_Outliers_Imputation, udxtype=c("transform"), intype=c("varchar","numeric"), outtype=c("varchar","numeric"), outnames=c("DateTime","Value") ) }

Leave a Comment

BoldItalicStrikethroughOrdered listUnordered list
Emoji
Image
Align leftAlign centerAlign rightToggle HTML viewToggle full pageToggle lights
Drop image/file