The Vertica Forum recently got a makeover! Let us know what you think by filling out this short, anonymous survey.

Problem with two R UDFs

Next two R UDFs do not work. I can send You .R files and data files used. My e-mail is [email protected] for both I receive next error message: select R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation(DateTime,Value) OVER() from DM.Demo_DataQuality1: imageimage and for select R_UDF_Time_Series_Outliers_Imputation(DateTime,Value) OVER() from DM.Demo_DataQuality2 imageimage R code: # R UDF NR 1 # Imputation for all missing values and records (datetime is missing) in time series R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation <- function(DF) { # Loading libraries require(plyr,quietly=TRUE) require(xts,quietly=TRUE) # Parameters: TIME_DATE_FORMAT <- '%d/%m/%Y %H:%M:%S' ISDATE <- 'NO' Cross_Vars <- NULL Impute_method='na.spline' DateTime_Var <- 'DateTime' Value_Var <- 'Value' names(DF) <- c(DateTime_Var,Value_Var) # Function for imputating missing values and records for given data frame my.func <- function(df){ # Data with missing records # Start datetime Str.START.DATETIME <- df[1,DateTime_Var] # End datetime Str.END.DATETIME <- df[nrow(df),DateTime_Var] # Creating contigous sequence of datetimes as vector and actual time series if (ISDATE == "YES") { full.datetimes.xts <- seq(from = as.Date(Str.START.DATETIME, format = TIME_DATE_FORMAT), to = as.Date(Str.END.DATETIME, format = TIME_DATE_FORMAT), by=1 ) } else { full.datetimes.xts <- seq(from = as.POSIXct(strptime(Str.START.DATETIME, TIME_DATE_FORMAT)), to = as.POSIXct(strptime(Str.END.DATETIME, TIME_DATE_FORMAT)), by = 3600 ) } actual.data.xts <- xts(df[[Value_Var]], as.POSIXct(strptime(df[[DateTime_Var]],TIME_DATE_FORMAT)) ) # Creating contigous sequence of datetimes as 'xts' object full.datetimes.xts <- xts(,full.datetimes.xts) # Creating whole time series containing also initialy missing records which values will be marked as 'NA' all.data.xts <- merge.xts(actual.data.xts, full.datetimes.xts, all=TRUE ) # Imputing missing values if (Impute_method == 'na.spline') all.data.imputed <- as.data.frame(na.spline(all.data.xts)) all.data.imputed[DateTime_Var] <- row.names(all.data.imputed) all.data.imputed <- all.data.imputed[c(DateTime_Var,'actual.data.xts')] names(all.data.imputed) = names(df)[1:2] return(all.data.imputed) } # Reformating datetime DF[DateTime_Var] <- as.character(DF[[DateTime_Var]]) # Finding missing values and records if (is.null(Cross_Vars)) { # Sorting data frame DF <- DF[order(DF[[DateTime_Var]]),] # Create unique ID for non cross sectional data DF$ID <- 1:nrow(DF) # Finding missing values and records and creating output data frame DF.OUT <- my.func(DF) # Re-arrange columns DF.OUT <- DF.OUT[,c(DateTime_Var,Value_Var)] } else { # Create unique ID for cross sectional data by concatenating values of table key # Converting all variables in 'Cross_Vars' to character # Creating vector of all cross variables Cross_Vars_Vec <- unlist(strsplit(Cross_Vars,split="-")) # Converting all variables for (my.var in Cross_Vars_Vec) DF[my.var] <- as.character(DF[my.var]) # Creating ID variable and putting it into dataframe DF$ID <- as.vector(apply(DF[Cross_Vars_Vec],1,function(x) paste(x,sep='_'))) # Sorting data frame DF <- DF[order(DF[["ID"]],DF[[DateTime_Var]]),] # Finding missing values and records for each subset by ID and creating output data frame DF.OUT <- plyr::ddply(DF, .(ID), my.func) # Re-arrange columns # DF.OUT <- } return(DF.OUT) } R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation_Factory <- function() { list(name=R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation, udxtype=c("transform"), intype=c("varchar","numeric"), outtype=c("varchar","numeric"), outnames=c("DateTime","Value") ) } # R UDF NR 2 # Imputation of outliers in time series found using STL decomposition of time series /local regression + IQR R_UDF_Time_Series_Outliers_Imputation <- function(DF) { # Loading libraries require(xts,quietly=TRUE) # Parameters: TIME_DATE_FORMAT <- '%d/%m/%Y %H:%M:%S' ISDATE <- 'NO' Cross_Vars <- NULL Impute_method='na.spline' DateTime_Var <- 'DateTime' Value_Var <- 'Value' names(DF) <- c(DateTime_Var,Value_Var) PLOT <- TRUE FILENAME.JPG <- 'R_UDF_Time_Series_Outliers_Imputation_Example.jpg' WIDTH <- 1600 HEIGHT <- 900 x.xts <- xts(DF[[Value_Var]], as.POSIXct(strptime(DF[[DateTime_Var]],TIME_DATE_FORMAT)) ) x.ts <- as.ts(x.xts) if(frequency(x.ts)>1) resid <- stl(x.ts,s.window="periodic",robust=TRUE)$time.series[,3] else { tt <- 1:length(x.ts) resid <- residuals(loess(x.ts ~ tt)) } resid.q <- quantile(resid,prob=c(0.25,0.75)) iqr <- diff(resid.q) limits <- resid.q + 1.5*iqr*c(-1,1) score <- abs(pmin((resid-limits[1])/iqr,0) + pmax((resid - limits[2])/iqr,0)) vec.scores <- as.vector(score) my.seq <- 1:length(vec.scores) index.outliers <- my.seq[vec.scores > 0] # creating output data frame if (Impute_method == 'na.spline') if (any(vec.scores > 0)) { # Recoding all outliers into 'NA' x.xts.na <- x.xts x.xts.na[index.outliers] <- NA x.imp.xts <- na.spline(x.xts.na) all.data.imputed <- as.data.frame(x.imp.xts) } else all.data.imputed <- as.data.frame(x.xts) all.data.imputed[DateTime_Var] <- row.names(all.data.imputed) all.data.imputed <- all.data.imputed[c(DateTime_Var,'V1')] names(all.data.imputed) = c(DateTime_Var,Value_Var) if(PLOT) if (any(vec.scores > 0)){ jpeg(FILENAME.JPG, width=WIDTH, height=HEIGHT) layout(1:2) # plot before imputation plot(x.xts, main='Time series with outliers') # plot after imputation plot(x.imp.xts, main='Time series after smoothing of outliers') dev.off() } return(all.data.imputed) } R_UDF_Time_Series_Outliers_Imputation_Factory <- function() { list(name=R_UDF_Time_Series_Outliers_Imputation, udxtype=c("transform"), intype=c("varchar","numeric"), outtype=c("varchar","numeric"), outnames=c("DateTime","Value") ) }

Comments

  • R code (in the first post there is incorrect one): # R UDF NR 1 # Imputation for all missing values and records (datetime is missing) in time series R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation <- function(DF) { # Loading libraries require(plyr,quietly=TRUE) require(xts,quietly=TRUE) # Parameters: TIME_DATE_FORMAT <- '%d/%m/%Y %H:%M:%S' ISDATE <- 'NO' Cross_Vars <- NULL Impute_method='na.spline' DateTime_Var <- 'DateTime' Value_Var <- 'Value' names(DF) <- c(DateTime_Var,Value_Var) # Function for imputating missing values and records for given data frame my.func <- function(df){ # Data with missing records # Start datetime Str.START.DATETIME <- df[1,DateTime_Var] # End datetime Str.END.DATETIME <- df[nrow(df),DateTime_Var] # Creating contigous sequence of datetimes as vector and actual time series if (ISDATE == "YES") { full.datetimes.xts <- seq(from = as.Date(Str.START.DATETIME, format = TIME_DATE_FORMAT), to = as.Date(Str.END.DATETIME, format = TIME_DATE_FORMAT), by=1 ) } else { full.datetimes.xts <- seq(from = as.POSIXct(strptime(Str.START.DATETIME, TIME_DATE_FORMAT)), to = as.POSIXct(strptime(Str.END.DATETIME, TIME_DATE_FORMAT)), by = 3600 ) } actual.data.xts <- xts(df[[Value_Var]], as.POSIXct(strptime(df[[DateTime_Var]],TIME_DATE_FORMAT)) ) # Creating contigous sequence of datetimes as 'xts' object full.datetimes.xts <- xts(,full.datetimes.xts) # Creating whole time series containing also initialy missing records which values will be marked as 'NA' all.data.xts <- merge.xts(actual.data.xts, full.datetimes.xts, all=TRUE ) # Imputing missing values if (Impute_method == 'na.spline') all.data.imputed <- as.data.frame(na.spline(all.data.xts)) all.data.imputed[DateTime_Var] <- row.names(all.data.imputed) all.data.imputed <- all.data.imputed[c(DateTime_Var,'actual.data.xts')] names(all.data.imputed) = names(df)[1:2] return(all.data.imputed) } # Reformating datetime DF[DateTime_Var] <- as.character(DF[[DateTime_Var]]) # Finding missing values and records if (is.null(Cross_Vars)) { # Sorting data frame DF <- DF[order(DF[[DateTime_Var]]),] # Create unique ID for non cross sectional data DF$ID <- 1:nrow(DF) # Finding missing values and records and creating output data frame DF.OUT <- my.func(DF) # Re-arrange columns DF.OUT <- DF.OUT[,c(DateTime_Var,Value_Var)] } else { # Create unique ID for cross sectional data by concatenating values of table key # Converting all variables in 'Cross_Vars' to character # Creating vector of all cross variables Cross_Vars_Vec <- unlist(strsplit(Cross_Vars,split="-")) # Converting all variables for (my.var in Cross_Vars_Vec) DF[my.var] <- as.character(DF[my.var]) # Creating ID variable and putting it into dataframe DF$ID <- as.vector(apply(DF[Cross_Vars_Vec],1,function(x) paste(x,sep='_'))) # Sorting data frame DF <- DF[order(DF[["ID"]],DF[[DateTime_Var]]),] # Finding missing values and records for each subset by ID and creating output data frame DF.OUT <- plyr::ddply(DF, .(ID), my.func) # Re-arrange columns # DF.OUT <- } return(DF.OUT) } R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation_Factory <- function() { list(name=R_UDF_Time_Series_Missing_Values_And_Records_Cubic_Splines_Imputation, udxtype=c("transform"), intype=c("varchar","numeric"), outtype=c("varchar","numeric"), outnames=c("DateTime","Value") ) } # R UDF NR 2 # Imputation of outliers in time series found using STL decomposition of time series /local regression + IQR R_UDF_Time_Series_Outliers_Imputation <- function(DF) { # Loading libraries require(xts,quietly=TRUE) # Parameters: TIME_DATE_FORMAT <- '%d/%m/%Y %H:%M:%S' ISDATE <- 'NO' Cross_Vars <- NULL Impute_method='na.spline' DateTime_Var <- 'DateTime' Value_Var <- 'Value' names(DF) <- c(DateTime_Var,Value_Var) PLOT <- TRUE FILENAME.JPG <- 'R_UDF_Time_Series_Outliers_Imputation_Example.jpg' WIDTH <- 1600 HEIGHT <- 900 x.xts <- xts(DF[[Value_Var]], as.POSIXct(strptime(DF[[DateTime_Var]],TIME_DATE_FORMAT)) ) x.ts <- as.ts(x.xts) if(frequency(x.ts)>1) resid <- stl(x.ts,s.window="periodic",robust=TRUE)$time.series[,3] else { tt <- 1:length(x.ts) resid <- residuals(loess(x.ts ~ tt)) } resid.q <- quantile(resid,prob=c(0.25,0.75)) iqr <- diff(resid.q) limits <- resid.q + 1.5*iqr*c(-1,1) score <- abs(pmin((resid-limits[1])/iqr,0) + pmax((resid - limits[2])/iqr,0)) vec.scores <- as.vector(score) my.seq <- 1:length(vec.scores) index.outliers <- my.seq[vec.scores > 0] # creating output data frame if (Impute_method == 'na.spline') if (any(vec.scores > 0)) { # Recoding all outliers into 'NA' x.xts.na <- x.xts x.xts.na[index.outliers] <- NA x.imp.xts <- na.spline(x.xts.na) all.data.imputed <- as.data.frame(x.imp.xts) } else all.data.imputed <- as.data.frame(x.xts) all.data.imputed[DateTime_Var] <- row.names(all.data.imputed) all.data.imputed <- all.data.imputed[c(DateTime_Var,'V1')] names(all.data.imputed) = c(DateTime_Var,Value_Var) if(PLOT) if (any(vec.scores > 0)){ jpeg(FILENAME.JPG, width=WIDTH, height=HEIGHT) layout(1:2) # plot before imputation plot(x.xts, main='Time series with outliers') # plot after imputation plot(x.imp.xts, main='Time series after smoothing of outliers') dev.off() } return(all.data.imputed) } R_UDF_Time_Series_Outliers_Imputation_Factory <- function() { list(name=R_UDF_Time_Series_Outliers_Imputation, udxtype=c("transform"), intype=c("varchar","numeric"), outtype=c("varchar","numeric"), outnames=c("DateTime","Value") ) }

Leave a Comment

BoldItalicStrikethroughOrdered listUnordered list
Emoji
Image
Align leftAlign centerAlign rightToggle HTML viewToggle full pageToggle lights
Drop image/file

Can't find what you're looking for? Search the Vertica Documentation, Knowledge Base, or Blog for more information.