############ Function compile_data() ############################## # # # compile_data(data) -> data is table with x columns, first has to be # # column with time stamp. The sequence of the # # other columns is optional # # The names have to be as follows: # # # # - Q_gas for gas demand in kWh # # - Q_el for electricity demand in kWh # # - Q_dh for heat from district heat in kWh # # - gas for gas consumption in m³ # # - H2O for water consumption in m³ # # - T_i for indoor air temperature of room that is heated # # (winter) and cooled (summer) in °C # # - T_i_heat for indoor air temperature of room heated only, in °C # # - T_i_cool for indoor air temperature of room cooled only, in °C # # - T_a ambient for outdoor air temperature in °C # # - Time for timestamp of day, either showing 15 min or 1 hour # # measurements, format has to be "yyyy.mm.dd hh:mm" # # e.g "2008.04.26 00:15" # # # # # # Description: compiles hourly/15 min consumption values to daily # # average deman values. Temperatures are averaged. # # # ######################################################################### compile.data <- function(data,area) { #Extract Time using timezone UTC to account for data not considering #day light savings z <- strptime(data[,1], format="%Y.%m.%d %H:%M", tz = "UTC") d <- as.POSIXct(z) #error message if(length(d[!is.na(d)]) < NROW(data)) { message("\nError in time format of function compile.data. Either timestamp is not in first column or format is differing from YYYY.mm.dd HH:MM") }else{ days <- format(d,"%Y.%m.%d") #delete Time column data <- data[,-1] #get names of columns data.names <- as.vector(names(data)) #make daily means amount_columns <- length(names(data)) daily.dummy <- as.vector(c()) for(i in 1:amount_columns) { holder <- tapply(as.numeric(data[,i]),list(days),mean, na.rm=T) if(i==1) Time <- as.character(row.names(holder)) daily.dummy <- as.data.frame(cbind(daily.dummy, as.numeric(holder))) } names(daily.dummy) <- c(data.names) daily.dummy <- cbind(daily.dummy,Time, stringsAsFactors=FALSE)[,c(amount_columns+1,1:amount_columns)] #check if 15 min data timeframe.check <- as.numeric(difftime(d[2],d[1]), units="mins") #Q_el 15min: from kWh to W/m² für P_el (*4h*1000W/area) pos.Q_el <- grep("Q_el", names(daily.dummy)) a <- length(pos.Q_el) if(a != 0 & timeframe.check == 15 ) { daily.dummy[,pos.Q_el] <- daily.dummy[,pos.Q_el]*4000/area names(daily.dummy)[pos.Q_el] <- gsub("Q_el","P_el", names(daily.dummy)[pos.Q_el]) } #Q_el 1 hour: from kWh to W/m² für P_el (*1000W/area) if(a != 0 & timeframe.check == 60 ) { daily.dummy[,pos.Q_el] <- daily.dummy[,pos.Q_el]*1000/area #daily.dummy$Q_el <- daily.dummy$Q_el*1000/area names(daily.dummy)[pos.Q_el] <- gsub("Q_el","P_el", names(daily.dummy)[pos.Q_el]) } #Q_dh 15min: from kWh to W/m² für P_dh (*4h*1000W/area) pos.Q_dh <- grep("Q_dh", names(daily.dummy)) a <- length(pos.Q_dh) if(a != 0 & timeframe.check == 15 ) { daily.dummy[,pos.Q_dh] <- daily.dummy[,pos.Q_dh]*4000/area names(daily.dummy)[pos.Q_dh] <-gsub("Q_dh","P_dh", names(daily.dummy)[pos.Q_dh]) } #Q_dh 1 hour: from kWh to W/m² für P_dh (*1000W/area) if(a != 0 & timeframe.check == 60 ) { daily.dummy[,pos.Q_dh] <- daily.dummy[,pos.Q_dh]*1000/area names(daily.dummy)[pos.Q_dh] <- gsub("Q_dh","P_dh", names(daily.dummy)[pos.Q_dh]) } #Q_gas 15min: from kWh to W/m² für P_gas (*4h*1000W/area) pos.Q_gas <- grep("Q_gas", names(daily.dummy)) a <- length(pos.Q_gas) if(a != 0 & timeframe.check == 15 ) { daily.dummy[,pos.Q_gas] <- daily.dummy[,pos.Q_gas]*4000/area names(daily.dummy)[pos.Q_gas] <- gsub("Q_gas","P_gas", names(daily.dummy)[pos.Q_gas]) } #Q_gas 1 hour: from kWh to W/m² für P_gas (*1000W/area) if(a != 0 & timeframe.check == 60 ) { daily.dummy[,pos.Q_gas] <- daily.dummy[,pos.Q_gas]*1000/area names(daily.dummy)[pos.Q_gas] <- gsub("Q_gas","P_gas", names(daily.dummy)[pos.Q_gas]) } #Gas 15min: from m³ to W/m² für P_gas (*10*1000w*4h*0.8/area) pos.Gas <- grep("gas", names(daily.dummy)) a <- length(pos.Gas) if(a != 0 & timeframe.check == 15 ) { daily.dummy$gas<-daily.dummy$gas*32000/area names(daily.dummy)[pos.Gas] <- "P_gas" } #Gas 1 hour: from m³ to W/m² für P_gas (*10*1000w*0.8/area) if(a != 0 & timeframe.check == 60 ) { daily.dummy$gas<-daily.dummy$gas*8000/area names(daily.dummy)[pos.Gas] <- "P_gas" } #H2O 15min: from m³ to L/(d*m²) (*4*1000/area) pos.H2O <- grep("H2O", names(daily.dummy)) a <- length(pos.H2O) if(a != 0 & timeframe.check == 15 ) daily.dummy$H2O <- daily.dummy$H2O*4000*24/area #H2O 1 hour: from m³ to L/(d*m²) (*1000/area) if(a != 0 & timeframe.check == 60 ) daily.dummy[,pos.H2O] <- daily.dummy[,pos.H2O]*1000*24/area #add column delta_Ti_heat pos.Ti_heat <- grep("T_i_heat$", names(daily.dummy)) aa <- length(pos.Ti_heat) if(aa != 0 ) { T_heat <- daily.dummy$T_i_heat delta_T_i_heat <- diff(T_heat, lag=1, differences=1) } #add column delta_Ti pos.Ti <- grep("T_i$", names(daily.dummy)) ab <- length(pos.Ti) if(ab != 0 ) { T_in <- daily.dummy$T_i delta_T_i <- diff(T_in, lag=1, differences=1) } #add column delta_Ti_cool pos.Ti_cool <- grep("T_i_cool$", names(daily.dummy)) ac <- length(pos.Ti_cool) if(ac != 0 ) { T_cool <- daily.dummy$T_i_cool delta_T_i_cool <- diff(T_cool, lag=1, differences=1) } if(aa != 0 | ab != 0 | ac !=0 ) { #necessary to delete first day daily.dummy <- daily.dummy[-1, ] row.names(daily.dummy) <- NULL } if(aa != 0) daily.dummy$delta_T_i_heat <- delta_T_i_heat if(ab != 0) daily.dummy$delta_T_i <- delta_T_i if(ac != 0) daily.dummy$delta_T_i_cool <- delta_T_i_cool daily.data <- daily.dummy daily.data }} ################# Function single.outlier() ########################## # # # single.outlier(data, method, multiplier_sd) -> # # data: data is table with 2 columns # # 1st column: Time with format "%Y.%m.%d" # # 2nd column: Values tested for outliers # # method: 0 or 1 or 2 # # 0: mean is 0 # # 1: mean is mean # # 2: mean is median # # multiplier: numric value indicating how many standard # # deviations values can be located from the # # mean not to be considered an outlier # # Description: # # General outlier detection function # # # # # ######################################################################### single.outlier <- function(data, method, multiplier_sd) { data <- data[!is.na(data[,2]),] f <- strptime(data[,1], format="%Y.%m.%d") date1 <- as.POSIXct(f) dataset <- as.numeric(data[,2]) #n_out <- 0 n_u <- 0.5*(length(dataset)-1) value <- as.vector(c()) date <- as.vector(c()) type <- as.vector(c()) class <- as.vector(c()) n <- length(dataset) for(i in 1:n_u) { if(method == 1) mean_x <- mean(dataset, na.rm=TRUE) if(method == 2) mean_x <- median(dataset, na.rm=TRUE) if(method == 0) mean_x <- as.numeric(0) #sd from the median #sd_x <- sqrt(sum((dataset-mean_x)^2)/(n-1)) sd_x <- sd(dataset, na.rm=TRUE) #sd_x <- sqrt(sum((dataset - mean_x)^2)/(n-1)) if(sd_x == 0){ break print("Standard deviation of variable tested for outlier is 0, can't proceed")} x_min <- min(dataset) a <- mean_x - x_min x_max <- max(dataset) b <- x_max - mean_x if(mean_x-min(dataset) > max(dataset)-mean_x) { d <- -a + mean_x #min d_row <- match(min(dataset), dataset) kind <- "min" diff_to_mean <- a }else{ d <- b + mean_x #max d_row <- match(max(dataset), dataset) kind <- "max" diff_to_mean <- b } if(multiplier_sd*sd_x < diff_to_mean) { class <- append(class, kind, after=i-1) type <- append(type, names(data)[2],after=i-1) value <- append(value, dataset[d_row], after=i-1) date <-append(date, as.POSIXct(date1[d_row]), after=i-1) dataset <- dataset[-d_row] date1 <- date1[-d_row] n <- NROW(dataset) }else{ break } } outlier_found <- data.frame(date, value, type, class) outlier_found } ############ Function feature.vector6() ########################### # # # feature.vector6(data) -> data is table with x columns, first has to be# # column with time stamp. Sequence of other # # columns is optional, the names of columns that # # are selected to generate the features have to # # be named as followed. (The table can be iden- # # tical to the one provided to the compile.data() # # function # # - Q_h for heating demand in kWh # # - Q_el for electricity demand in kWh # # - Q_dh for ditricht heatinh in kWh # # - Gas for gas consumption in m³ # # - H2O for water consumption in m³ # # - Time for timestamp of day, either showing 15 min or 1 hour # # measurements, format has to be "dd.mm.yyyy hh:mm" # # e.g "20.04.2008 00:15" # # # # Description: # # Two features (average and hourly peak consumption within # # one day) are created for electricity, heat and water consumption# # resulting in 6 features for each day # # # ######################################################################### feature.vector6 <- function(data) { #extract time z <- strptime(data[,1], format="%Y.%m.%d %H:%M", tz="UTC") d <- as.POSIXct(z) days.ave <- format(d,"%Y.%m.%d") days.max <- format(d,"%Y.%m.%d %H") #make peak, max feature features.h <- as.vector(c()) for(i in 2:ncol(data)) { holder <- tapply(as.numeric(data[,i]), list(days.max),sum, na.rm=T) if(i==2) Time <- as.character(row.names(holder)) features.h <- as.data.frame(cbind(features.h, as.numeric(holder))) } z.1 <- strptime(Time, "%Y.%m.%d %H") d.1 <- as.POSIXct(z.1) days.max.1 <- format(d.1, "%Y.%m.%d") features.max <- as.vector(c()) for(j in 1:ncol(features.h)) { peak <- tapply(as.numeric(features.h[,j]), list(days.max.1), max, na.rm=T) if(j ==1) time_stamp <- as.character(row.names(peak)) features.max <- as.data.frame(cbind(features.max, as.numeric(peak))) } names(features.max) <- names(data)[2:ncol(data)] names(features.max)<- paste(names(features.max),"max",sep="_") #make average features features.ave <- as.vector(c()) for(k in 2:ncol(data)) { ave <- tapply(as.numeric(data[,k]), list(days.ave), mean, na.rm=T) features.ave <- as.data.frame(cbind(features.ave, as.numeric(ave))) } names(features.ave) <- names(data)[2:ncol(data)] names(features.ave)<- paste(names(features.ave),"ave",sep="_") #combine ave and max features feature_vector <- as.data.frame(cbind(features.max, features.ave)) feature_vector <- cbind(feature_vector, time_stamp, stringsAsFactors=FALSE)[,c(length(feature_vector)+1, 1:length(feature_vector))] names(feature_vector)[1] <- "Time" feature_vector } ################# Function stand.features.training() ################# # # # stand.features.training(data, moving) -> # # data: is result from feature.vector6 function # # moving: size of moving window in days, normaly 7 # # # # Description: Standardizes features # ######################################################################### stand.features.training<- function(data, moving) { #make moving window according to size of "moving" time_vector <- format(as.POSIXct(strptime(data[,1],"%Y.%m.%d")),"%A") n_days <- NROW(data) n_weeks <- n_days/moving week_vector <- rep(1:n_weeks, each=moving) #if diff smaller, week_vector shorter diff <- length(week_vector)-NROW(data) #if true week_vector is shorter, more days than weeks if(diff < 0)week_vector <- append(week_vector, c(rep(max(week_vector)+1, each=abs(diff)))) if(diff > 0)week_vector <- week_vector[1:(length(week_vector)-diff)] maximas_weekly <- aggregate(data[,-1], by=list(week_vector), max) maximas_weekly <- maximas_weekly[,-1] maximas_weekly <- sapply(maximas_weekly, function(a) replace(a, a==0, 0.1)) data1 <- data[,-1]/maximas_weekly[week_vector,] last_maxima <<- maximas_weekly[NROW(maximas_weekly), ] maximas_weekly_training <<- maximas_weekly stand_features <- data.frame(Time=data[,1], data1) } ################# Function features.outlier() ######################## # # # features.outlier(data1, data2, holidays, # # amount_outliers,multiplier_sd) -> # # data1 <- daily_training data table # # data2 <- features_training data table # # holidays <- vector containing holidays # # amount_outliers <- how many outliers have to be # # be found for day to be deleted # # method <- method for selecting the mean # in single.outlier # multiplier_sd <- value for the standard # # deviation in single.outlier # # # # Description: checks features for outliers # ######################################################################### #data1 <- daily_training #data2 <- features_training #amount_outliers <- 1 #multiplier_sd <- 2.9 #method <- 2 #data1 <- daily_training_outl_del #data2 <- stand_features_training #amount_outliers <- 2 #multiplier_sd <- 2.4 #method <- 1 features.outlier <- function(data1, data2, holidays,amount_outliers,method,multiplier_sd) { #Create empty data.frame for single outliers res_outlier_m <- data.frame(date=character(0), value=numeric(0), day=character(0)) d <- strptime(data2[,1], "%Y.%m.%d") z <- as.POSIXct(d) columns <- length(data2) weekdays <- format(d, "%a") #creating list of different 8 clusters list_holidays <- as.list(holidays) index_holidays <- lapply(list_holidays, function(t) match(t,data2[,1])) index_holidays <- as.numeric(index_holidays[!is.na(index_holidays)]) weekdays[index_holidays] <- "Holiday" list_days <- as.vector(unique(weekdays)) feat_outlier <- data.frame(date=character(0), value=numeric(0), type=character(0), class=character(0), day=character(0)) #loop running through days of the week, starting with 0 = Sunday #and creating subsets holding the different days for(i in 1:length(list_days)) { day <- subset(data2, weekdays == list_days[i]) for(j in 2:ncol(day)) { out <- single.outlier(day[,c(1,j)], method, multiplier_sd) if(NROW(out) > 0) { out$day <- as.character(list_days[i]) feat_outlier <- rbind(feat_outlier, out) out <- NULL } } } feat_outlier <- feat_outlier[order(feat_outlier[,1]),] numeric_dates <- as.numeric(feat_outlier[,1]) #returns how many times a certain date has been found to be a outlier struct_dates <- rle(numeric_dates) struct_dates1 <- data.frame(struct_dates$lengths, struct_dates$values) extract_items <- subset(struct_dates1, struct_dates1[,1] >= amount_outliers) numeric_rownames <- as.numeric(row.names(extract_items)) row_index <- vector() if(length(numeric_rownames) >0) { for(i in 1:length(numeric_rownames)) { index <- sum(struct_dates1[1:numeric_rownames[i],1]) row_index <- append(row_index, index) } Dates_out <- as.POSIXct(feat_outlier[row_index,1]) result_features_out <- data.frame(feat_outlier[row_index, c(1,2,5)]) rows_delete <- sapply(unique(Dates_out), function(a) match(a,z)) features_training_outl_del <- data2[-rows_delete,] daily_training_outl_del <- data1[-rows_delete,] }else{ features_training_outl_del <- data2 daily_training_outl_del <- data1 result_features_out <- "no days were found to be outliers" } #Available outside of function features_training_outl_del <<- features_training_outl_del daily_training_outl_del <<- daily_training_outl_del result_features_out } ############ Function clustering() ####################### # # # clustering(data, holidays) -> # # # # data = standardized features, with "Time" in 1st column # # in format "yyyy.mm.dd" # # holidays = vector of holidays, if not available specifiy "NA" # # # # Description: clustering process of training stage # ######################################################################### #data <- stand_features_training clustering <- function(data, holidays) { #aplha value alpha <- 3.2 #"n" gives number of columns of feat_t n <- length(data) #"n_features" gives numbers of features n_features <- length(data)-1 #finding 1st closest cluster #bind column holding all different days of the week to data Sys.setlocale <- Sys.setlocale("LC_TIME", "English") d <- strptime(data[,1], "%Y.%m.%d") #data[,1] <- as.POSIXct(d) list_holidays <- as.list(holidays) index_holidays <- lapply(list_holidays, function(t) match(t,data[,1])) index_holidays <- as.numeric(index_holidays[!is.na(index_holidays)]) data[,1] <- as.POSIXct(d) days <- format(data[,1], "%a") #to include holiday cluster if provided if(length(holidays) > 1) days[index_holidays] <- "Holiday" data <- cbind(data, days) max_cl <- length(unique(days)) #"comb" are all max possible combinations of different clusters #to get dissimilarity coefficients "dc" comb <- combn(unique(as.character(days)),2) #creating empty vector for result "res_dc" dc_hold <- as.vector(c()) res_dc <- data.frame(day1=character(0), day2=character(0)) #loop for all 21 possible combinations and resulting dc for(i in 1:ncol(comb)) { day1 <- as.matrix(subset(data[,2:n], data[,n+1] == comb[1,i])) day2 <- as.matrix(subset(data[,2:n], data[,n+1] == comb[2,i])) library(fields) dc <- sum(rdist(day1, day2))/(NROW(day1)*NROW(day2)) dc_hold <- append(dc_hold, dc, after=i-1) res_dc <- rbind(res_dc, cbind(comb[1,i],comb[2,i])) } res_dc <- cbind(res_dc, dc_hold) names(res_dc) <- c("cl1","cl2","dc_value") #get combination with smallest dc a <- res_dc[order(res_dc[,3]), ] cl_cl1 <- a[1,] cluster <- cl_cl1 #View(res_dc) #3. task: check whether clusters should be merged, creating new merged #clusters until they are too far from each other to be merged #"SS" function to calculate the sum of squared distances of the feature #vectors of two clusters SS <- function(day_data) { ss <- sum(sapply(day_data, function(a) (a-mean(a))^2) ) } update_dc <- function(C_i, C_j, C_k) { dc <- (sum(rdist(C_k, C_i))/(NROW(C_k)*NROW(C_i)))* NROW(C_i)/(NROW(C_i)+NROW(C_j))+ (sum(rdist(C_k, C_j))/(NROW(C_k)* NROW(C_j)))*NROW(C_j)/(NROW(C_i)+NROW(C_j)) } duda <- as.vector(c()) cl_ratio <- as.vector(c()) #loop is run to a maximum of 6 times, the last time is to test if second last cluster #combination should be merged for(i in c((n+1):(n+max_cl-2))) { #Extract name of closest cluster and paste together "merge1" d1 <- as.character(cl_cl1[,1]) d2 <- as.character(cl_cl1[,2]) merg1 <- paste(d1,d2) #check whether cluster should be merged #select days specified in "d1" and "d2" that belong to closest cluster cl1 <- subset(data[,2:n], data[,i] == as.character(cl_cl1[,1])) cl2 <- subset(data[,2:n], data[,i] == as.character(cl_cl1[,2])) #compute value of Duda and Hart stopping rule for "cl1" and "cl2" SS1 <-SS(cl1) SS2 <-SS(cl2) cl1_2 <- rbind(cl1, cl2) SS1_2 <-SS(cl1_2) cl_ratio1 <- (SS1+SS2)/SS1_2 cl_ratio <- append(cl_ratio, cl_ratio1) #, after=i-4) #standard normal score duda1 <- 1- 2/(pi*n_features)-alpha*sqrt(2*(1-8/(pi^2*n_features))/ (NROW(cl1_2)*n_features)) duda <- append(duda, duda1) #, after=i-4) #if condition is TRUE clusters can be merged # & i <=14 if(cl_ratio1 > duda1 & i <=(n+max_cl-3)){ #delete the days belonging to the recently merged clusters #from "res_dc" table res_dc <- subset(res_dc, !(cl1 %in% c(as.character(cl_cl1[,1]), as.character(cl_cl1[,2])) |cl2 %in% c(as.character(cl_cl1[,1]), as.character(cl_cl1[,2])) )) d <- c(as.character(res_dc[,1]),as.character(res_dc[,2])) #create list of days holding all days except the recently merged ones days_left1 <- unique(d) #make table holding all possible new combinations bewteen #new cluster and old days y <- expand.grid(days_left1,merg1) names(y) <- c("cl1","cl2") #Create subsets of the two days to be merged #ci <- subset(data[,2:n], data[,i] == as.character(cl_cl1[,1])) #cj <- subset(data[,2:n], data[,i] == as.character(cl_cl1[,2])) #loop through all possible combinations of new cluster and old days for(j in 1:NROW(y)){ clk <- subset(data[,2:n], data[,i] == as.character(y[j,1])) dc_value <- update_dc(cl1,cl2,clk) dc_new <- y[j, ] dc_new <- cbind(dc_new, dc_value) #add new dc value to remaining unchanged dc values in "res_dc" res_dc <- rbind(res_dc,dc_new) } #Extract closest two cluster a1 <- res_dc[order(res_dc[,3]), ] cl_cl1 <- a1[1,] cluster <- rbind(cluster, cl_cl1) #adding column "merge" holding potential combined cluster to data days[days == d1] <- merg1 days[days == d2] <- merg1 merge <- days data <- cbind(data, merge) }else{ if(cl_ratio1 > duda1){ #adding column "merge" holding potential combined cluster to data days[days == d1] <- merg1 days[days == d2] <- merg1 merge <- days data <- cbind(data, merge) #test last two clusters res_dc <- unique(as.character(data[,i+1])) names(res_dc) <- c("cl1","cl2") res_dc <- as.data.frame(t(res_dc)) clk <- subset(data[,2:n], !(data[,i] == as.character(cl_cl1[,1])| data[,i] == as.character(cl_cl1[,2]))) dc_value <- update_dc(cl1,cl2,clk) res_dc <- cbind(res_dc, dc_value) cl_cl1 <- res_dc cluster <- rbind(cluster, res_dc) #check whether last two clusters should be combined cl1 <- subset(data[,2:n], data[,i+1] == as.character(cl_cl1[,1])) cl2 <- subset(data[,2:n], data[,i+1] == as.character(cl_cl1[,2])) SS1 <-SS(cl1) SS2 <-SS(cl2) cl1_2 <- rbind(cl1, cl2) SS1_2 <-SS(cl1_2) cl_ratio1 <- (SS1+SS2)/SS1_2 cl_ratio <- append(cl_ratio, cl_ratio1) #, after=i-4) #standard normal score duda1 <- 1- 2/(pi*n_features)-alpha*sqrt(2*(1-8/(pi^2*n_features))/ (NROW(cl1_2)*n_features)) duda <- append(duda, duda1)#, after=i-4) if(cl_ratio1 > duda1) { d1 <- as.character(cl_cl1[,1]) d2 <- as.character(cl_cl1[,2]) merg1 <- paste(d1,d2) days[days == d1] <- merg1 days[days == d2] <- merg1 merge <- days data <- cbind(data, merge) last_cl <- data.frame(cl1 = merg1, cl2 = merg1, dc_value= "NA") cluster <- rbind(cluster, last_cl) duda <- append(duda, "NA") #, after=i-2) cl_ratio <- append(cl_ratio, "NA") #, after=i-2) } }else{ break }} } cluster_names <<- unique(data[,ncol(data)]) cluster_table <<- data cluster<-cbind(cluster, cbind(cl_ratio, duda)) } ############ Function multiple.regression() #################### # # # multiple.regression(target_variable, training_data) -> # # # # target_variable = either "gas", "electricity", "district_heat"# # or "water" # # training_data = data table with column day_types # # # # Description: function for the regression model in the training # # stage. # # # ######################################################################### #training_data <- regression_data multiple.regression <-function(target_variable, training_data ) { if(target_variable == "gas") model <- "P_gas ~ high*day_types / (I(T_a-CP) + P_el + H2O + T_i_heat + delta_T_i_heat)" if(target_variable == "electricity") model <- "P_el_splits ~ high*day_types / (I(T_a-CP) + P_dh + H2O + T_i_cool + delta_T_i_cool)" #if(target_variable == "electricity") # model <- "P_el ~ high*day_types / (I(T_a-CP) + P_gas + H2O + # T_i_cool + delta_T_i_cool)" if(target_variable == "district_heat") model <- "P_dh ~ high*day_types / (I(T_a-CP) + P_el + H2O + T_i_heat + delta_T_i_heat)" #model <- "P_dh ~ high*day_types / (P_el + H2O # )" if(target_variable == "water") model <- "H2O ~ high*day_types / (I(T_a-CP) + P_el + T_i_cool + P_gas+ delta_T_i_cool)" regression <- function(CP, formul) { training_data$high <<- training_data$T_a >= CP #lm is stored in mod training_data<-na.omit(training_data) #lm_model <<- lm(as.formula(formul), data=training_data) lm_model <<- step(lm(as.formula(formul), data=training_data),trace=0) #summary is stored in summa summa <<-summary(lm_model) #value given to optimize function summa$adj.r.squared } opt_CP <- optimize(regression, c(3,30), tol=0.001, maximum=TRUE, formul=model) CP<<-opt_CP$maximum adj.r.squared <<-summa$adj.r.squared lm_model } ############ Function add.daytypes() ################################# # # # add.daytypes(daytypes, data, holidays) -> # # daytypes = current daytypes, result of clustering cluster_names # # data = data with first time column for which daytypes should be made# # holidays = holiday vector # # # # Description: Adds daytypes to data table so that regression can be run. # # The daytypes are result of clustering function. # # # # # ############################################################################### add.daytypes <- function(day_type_names, data, holidays){ days <-format(as.POSIXct(strptime(data[,1], "%Y.%m.%d")),"%a") list_holidays <- as.list(holidays) index_holidays <- lapply(list_holidays, function(m) match(m,data[,1])) index_holidays <- as.numeric(index_holidays[!is.na(index_holidays)]) days[index_holidays] <- "Holiday" col_daytypes <- sapply(days, function(d) grep(d,day_type_names)) day_types <- day_type_names[col_daytypes] data$day_types <- day_types data } ############ Function plot.regression.training() ####################### # # # plot.regression.training(CP, lm_model, data, outlier, daytypes) # # CP = change point from regression_model # # data = daily values of measurements, result of compile.data function # # outlier = outliers found in features and deleted before regression # # daytypes = current daytypes, result of clustering cluster_names # # # # Description: Plots the result of the regression model in the # # training stage # # # ################################################################################# #lm_model <- regression_model #data <- daily_training[46:NROW(daily_training),] #data <- daily_training[1:45,] #data <- daily_training[9:54,] #outlier <- result_features_out #daytypes <- cluster_names plot.regression.training <- function(CP, lm_model, data, outlier, daytypes) { #get target variable's name a <- lm_model$call b <- as.list(a$formula) Y <- b[[2]] #make necessary columns for predict data$high <- data$T_a >= CP #make day_types according to last clusterprocess data <- add.daytypes(daytypes, data, holidays) predict1 <- lm_model$fitted.values #predicts all days of training also the ones found as outliers in features fitted_values <- predict(lm_model, newdata = data) fitted_values[fitted_values < 0] <- 0 #measured values measured_values <- data.frame(Time = data[,1], data[,names(data)==Y] ) names(measured_values)[2] <- as.character(Y) #get outliers of features outlier <- outlier[order(outlier[,1]),] find_dates <- format(outlier[,1],"%Y.%m.%d") outlier_value <- data[which(data[,1] %in% find_dates),names(data)==Y] #create time axis a <- as.POSIXct(strptime(data[,1], "%Y.%m.%d"), format="%Y-%m-%d") max1 <- max(fitted_values, na.rm=T) max2 <- max(measured_values[,2]) if(max1 > max2) { maximum <- max1 }else{ maximum <- max2 } #for color vector if(length(cluster_names) > 2) { data1 <- as.character(data[,ncol(data)]) data1[which(data1%in%as.character(cluster_names[1]))] <- 2 data1[which(data1%in%as.character(cluster_names[2]))] <- 1 data1[which(data1%in%as.character(cluster_names[3]))] <- "royalblue" data$day_types <- data1 color_vec <- data$day_types } color_vec <- as.character(data[,ncol(data)]) for(i in 1:NROW(color_vec)) { color_vec[which(color_vec %in% "Holiday Sun Sat")] <- 2 color_vec[which(color_vec != 2)] <- 1 } par(cex=1.3 ,mar=c(5.1,4.1,8.6,0.2),xpd=TRUE ) plot(a, measured_values[,2], xaxt="n", pch=16, xlab="Time", ylab=paste("demand of ",Y," in W/m²"), col=color_vec, #ylim=c(0,3.5)) ylim=c(0,(maximum+maximum*0.1)) ) lines(a, measured_values[,2],xaxt="n", type="l", lwd=1.6) points(a, fitted_values, xaxt="n", pch=1, col=color_vec) lines(a, fitted_values, xaxt="n", type="l", lty="dotted") #points(outlier[,1], outlier_value, xaxt="n", pch=8, col="blue", ps=4) axis.POSIXct(1, at = seq(min(a), max(a), by = "1 day"), format="%d.%m") grid(nx = NA, ny = NULL, col = "grey",lwd = par("lwd"), lty = "dotted", equilogs=TRUE) if(length(cluster_names) > 2) { legend(x=min(a),y=maximum+maximum*0.38, ncol= 2,cex=0.95, x.intersp = 0.2, c("measured time series","fitted time series", "working days", "sundays/holidays", "saturdays", "outlier in features"), lty=c(1,2, 0,0,0,0), pch=c(NA,NA,16,16,16,1,1,1,NA), col=c( "black","black","black","red","royalblue", NA), box.lty=0) }else{ legend(x=min(a),y=maximum+maximum*0.75, ncol= 1,cex=0.95, x.intersp = 0.2, c("measured time series","fitted time series", "working days", "sundays/holidays", "saturdays", "outlier in features - not used to train regression model"), lty=c(1,2, 0,0,0,0), pch=c(20,20,20,20,20,NA), col=c( "black","black","black","red","royalblue", NA), box.lty=0) } } ############ Function dc.coefficient.check() ######################### # # # dc.coefficient.check(new_feat, cluster_names, cluster_table) # # # # new_feat = features of new day # # cluster_names = result of clustering function # # cluster_table = result of clustering function # # # # Description: Compares the dissimiliarity coefficients of the new day # # to the different clusters and decides to which cluster # # the day should belong to. # # # # # ############################################################################### dc.coefficient.check <- function(new_feat, cluster_names, cluster_data){ d <- strptime(new_feat[,1], format="%Y.%m.%d") day_POS <- as.POSIXct(d) type <- format(day_POS, "%a") day_belongs_cluster <- cluster_names[grep(type, cluster_names)] cluster_data2 <- add.daytypes(cluster_names,cluster_data,holidays) potential_cl <- data.frame(day_type = as.character(day_belongs_cluster), Date = as.character(d), day = type) all_dc_coefficients <- dc.coefficients(cluster_data2, new_feat, potential_cl) #smallest_dc <- compare.dc(cluster, all_dc_coefficients, potential_cl) smallest_dc <- all_dc_coefficients[which(all_dc_coefficients[,3] == min(all_dc_coefficients[,3])),] SS <- function(day_data) { ss <- sum(sapply(day_data, function(a) (a-mean(a))^2) ) } cl1 <- new_feat[,2:(ncol(new_feat))] cl2 <- subset(cluster_data2[,2:(ncol(cluster_data2)-1)], cluster_data2[,ncol(cluster_data2)] == smallest_dc[1,1] ) SS1 <-SS(cl1) SS2 <-SS(cl2) cl1_2 <- rbind(cl1, cl2) SS1_2 <-SS(cl1_2) cl_ratio1 <- (SS1+SS2)/SS1_2 alpha <- 3.2 n_features <- length(cluster_data)-2 #standard normal score duda1 <- 1- 2/(pi*n_features)-alpha*sqrt(2*(1-8/(pi^2*n_features))/ (NROW(cl1_2)*n_features)) if(cl_ratio1 > duda1) { result <- data.frame(Date=potential_cl[,2],day=potential_cl[,3], cluster=smallest_dc[1,1]) }else{ result <- data.frame(Date=potential_cl[,2],day=potential_cl[,3], cluster="unknown daytype") } result } ############ Function dc.coefficients() ######################## # # # dc.coefficients(cluster_table, new_feat, potential_cl) # # # # cluster_table = result of clustering function # # new_feat = features of new day # # potential_cl = result within function dc.coefficient.check # # # # Description: Function calculates the dissimiliarity coefficient of # # new day to existing day types. # # # # # ######################################################################### dc.coefficients <- function(cluster_data2, new_feat, potential_cl) { #clusters <- unique(cluster_table[,ncol(cluster_table)]) #cluster_data <- add.daytypes(cluster_names,cluster_data,holidays) new_day <- as.character(potential_cl[,2]) comb_old <- combn(unique(as.character(cluster_data2[,ncol(cluster_data2)]) ),2) comb_new <- expand.grid(unique(as.character(cluster_data2[,ncol(cluster_data2)]) ), new_day) #m <- ncol(features_change)-2 #new_feat$day_type <- potential_cl[,1] cl_day <- new_feat[,-1] data_rest <- cluster_data2[,-c(1,ncol(cluster_data2))] dc_hold <- as.vector(c()) res_dc <- data.frame(day1=character(0), day2=character(0)) for( i in 1:ncol(comb_old)) { day1 <- as.matrix(subset(data_rest, cluster_data2[,ncol(cluster_data2)] == comb_old[1,i])) day2 <- as.matrix(subset(data_rest, cluster_data2[,ncol(cluster_data2)] == comb_old[2,i])) library(fields) dc <- sum(rdist(day1, day2))/(NROW(day1)*NROW(day2)) dc_hold <- append(dc_hold, dc, after=i-1) res_dc <- rbind(res_dc, cbind(comb_old[1,i],comb_old[2,i])) } for( j in 1:NROW(comb_new)) { cl <- as.matrix(subset(data_rest, cluster_data2[,ncol(cluster_data2)] == as.character(comb_new[j,1]))) dc <- sum(rdist(cl_day, cl))/(NROW(cl_day)*NROW(cl)) dc_hold <- append(dc_hold, dc, after=(i+j)-1) res_dc <- rbind(res_dc, cbind(as.character(comb_new[j,1]), as.character(comb_new[j,2]))) } res_dc <- cbind(res_dc, dc_hold) names(res_dc) <- c("cl1","cl2","dc_value") dc_cluster <- res_dc } ############ Function compare.dc() ############################# # # # compare.dc(cluster1, cluster2, potential_cl) # # # # cluster1 = result of clustering function # # cluster2 = result of dc.coefficients # # potential_cl = result within function dc.coefficient.check # # # # Description: Compares the dissimiliarity coefficients of the new day # # to the different clusters and decides to which cluster # # the day should belong to. # # # # # ######################################################################### compare.dc <- function(cluster1, cluster2, potential_cl) { pot_cl_dc <- subset(cluster2, as.character(cluster2[,1])==potential_cl[,1] & as.character(cluster2[,2])==potential_cl[,2]) #subset giving dc values of new day to clusters dc_sub <- subset(cluster2, cluster2[,2] == as.character(potential_cl[,2])) minima <- dc_sub[which.min(dc_sub[,3]),] result <- minima } ############ Function predict.regression() ##################### # # # predict.regression(CP, data, lm_model) # # # # CP = result of multiple.regression() # # data = daily data plus new day # # lm_model = result of multiple.regression() # # # # Description: Computes high and T_a-CP column for new day. # # lm_model is applied to predict the target variable's # # consumption # # # ######################################################################### #lm_model <- regression_model #data <- regression_data predict.regression <- function(CP, data, lm_model) { #creating high column for regression data$high <- data$T_a >= CP #get Y a <- lm_model$call b <- as.list(a$formula) Y <- b[[2]] #Y <- "P_el" #newDay only newDay <- data[NROW(data),] #predict target variable predicted_values <- predict.lm(lm_model, data, se.fit=T) #calculate residuals, first set all neg values to 0 fitted_noNeg <- predicted_values$fit fitted_noNeg[fitted_noNeg<0] <- 0 fitted_noNeg <<- fitted_noNeg resid <- data[,names(data) == Y] - fitted_noNeg #create table with date for single.outlier() Time <- data[,1] resid_analyse <- data.frame(Time, resid) colnames(resid_analyse) <- c("Time","residuals") residuals_prediction <- resid_analyse residuals_prediction } ############ Function residual.outlier.test(data1, data2, Y) ### # # # residual.outlier.test(data1, data2) # # # # data1 = residuals of prediction with time column # # data2 = regression_data table # # Y = target variables name, after compile.data has been run# # Description: Runs a outlier test on the residuls of the prediction. # # Values are considered an outlier if they deviate more # # than 10% based on the maximum value # # # ######################################################################### residual.outlier.test <- function(data1, data2, Y) { #get column with Y in data2 Y_data <- data2[,names(data2)==Y] #get range of Y_data data_range <- max(Y_data)-min(Y_data) #assume 5% off is outlier limit <- data_range*0.15 data1 <- data1[!is.na(data1[,2]),] f <- strptime(data1[,1], format="%Y.%m.%d") time_column <- as.POSIXct(f) dataset <- as.numeric(data1[,2]) value <- as.vector(c()) date <- as.vector(c()) type <- as.vector(c()) class <- as.vector(c()) n <- length(dataset) mean_x <- as.numeric(0) for(i in 1:NROW(dataset)) { if(sd(dataset) == 0) { break print("residual.outlier.test: Standard deviation of variable tested for outlier is 0, all values are identical") } x_min <- min(dataset) a <- mean_x - x_min x_max <- max(dataset) b <- x_max - mean_x if(mean_x-min(dataset) > max(dataset)-mean_x) { d <- -a + mean_x #min d_row <- match(min(dataset), dataset) kind <- "min" diff_to_mean <- a }else{ d <- b + mean_x #max d_row <- match(max(dataset), dataset) kind <- "max" diff_to_mean <- b } if(limit < diff_to_mean) { class <- append(class, kind, after=i-1) type <- append(type, paste("residual of", Y), after=i-1) value <- append(value, dataset[d_row], after=i-1) date <-append(date, as.POSIXct(time_column[d_row]), after=i-1) dataset <- dataset[-d_row] time_column <- time_column[-d_row] n <- NROW(dataset) }else{ break } } outlier_found <- data.frame(date, value, type, class) outlier_found } ################################################################################## # # # carpet.plot(from, to, data, cs) -> graphs a carpet plot for specified # # timeframe # # # # # # - Required are packages "lattice" and "grid", by default they should # # be included in folder "library" and are loaded within carpet.plot # # - First column of data has to be Time, showing 1 hour # # measurements, format has to be "YY.mm.dd hh:mm" # # e.g "20.04.2008 00:00" # # - from <- "2007.11.22 00:00" has to start at midnight # # - to <- "2008.06.25 23:00" until 23th hour of day # # - cs has to be in the form: c(number, number,......,number) with # # number being the columns of the data table that should be graphed # # - name of graphed variable should be e.g: T_a_°C, will be used as # # graph description # # # # Function is called e.g. like this: # # # # carpet.plot("2007.11.22 00:00","2008.06.25 23:00", hourly_d, cs(2,3) # # # # Description: plots up to three carpet plots # # # ################################################################################## carpet.plot <- function(from, to, data, cs){ #Select time frame specified with "from" and "to" if(length(grep(" 00:00",from)) != 1) { print("selected start time doesn't begin at 00:00 o'clock") }else{ if(length(grep(" 23:00",to)) != 1) { print("selected end time doesn't stop at 23:00 o'clock") }else{ data <- data[c(grep(from,as.character(data[,1])):grep(to, as.character(data[,1]))),] #Extract the time format d <- strptime(data[,1], "%Y.%m.%d %H:%M") day_stamp <- format(d, "%Y.%m.%d") n <- length(cs) #Extract the amount of days t <- unique(day_stamp) days <- NROW(t) #Extract days as POSIXct format, is x-axis t.1 <- strptime(t,"%Y.%m.%d") x <- as.POSIXct(t.1) #b is amount of tick marks on x-axis b <- days/15 #Make color platte self.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000")) #loading necessary packages library(lattice) library(grid) #specifiying graphics device parameters if more than one graph per page grid.newpage() pushViewport(viewport(layout = grid.layout(n,1))) #Creating Matrix M for plotting, n is amount of columns specified with cs() for(i in 1:n){ M <- t(array(data[,cs[i]], dim=c(24,days))) #Specifiying position of graph if more than one plot per page pushViewport(viewport(layout.pos.col=1, layout.pos.row=i)) #actual graph command including color scheme, x-axis, labels print(levelplot(M,row.values = x, aspect="fill",contour=F, region=T, col.regions= self.colors(20), xlab=paste("Days showing", names(data)[cs[i]], sep=" "), ylab="Hours", scales=list( x=list( at = seq(min(x), max(x), by = paste(round(b,0), "day", sep =" ")), format="%d.%b")) ), newpage = F) #lifting Vieport 1 level up, necessary for subsequent graphs popViewport(1) } popViewport() }}} ############ Function dialog.Yes.NO() ########################## # # # dialog.Yes.NO(title, question) # # # # title = title of dialog box, in "title" # # question = question to which answer Yes/No belongs # # # # # # Description: Opens dialog box, Value returned is 1 for Yes, 2 for No # # and 3 if box has been closed by x button # # # # # ######################################################################### dialog.Yes.NO <- function(title, question) { library(tcltk) #Create new toplevel window tt <- tktoplevel() tktitle(tt) <- title #define variable that keeps track of state of window done <- tclVar(0) #create the buttons and assign values to them Yes.but <- tkbutton(tt, text=" Yes ",command=function()tclvalue(done)<-1) No.but <- tkbutton(tt, text=" No ",command=function()tclvalue(done)<-2) tkbind(tt, "", function() tclvalue(done)<-3) #window graphics tkgrid(tklabel(tt,text=" ")) tkgrid(tklabel(tt,text=question)) tkgrid(tklabel(tt,text=" ")) tkgrid(Yes.but) tkgrid.configure(Yes.but, sticky="nswe") tkgrid(No.but) tkgrid.configure(No.but, sticky="nswe") tkgrid(tklabel(tt,text=" ")) #place focus to window tkfocus(tt) tkwait.variable(done) #get value of variable, see what user has done doneval <- as.integer(tclvalue(done)) tkdestroy(tt) doneval } ############ Function new.feature.validation() ############################# # # # new.feature.validation(data1, new_feat_not_stand,holidays) # # # # data1 = data table called features_training_outl_del, unstandardised# # new_feat_not_stand = new feat if new not standrardised # # # # Description: Standardises each new feature of an incoming day # # # # # ############################################################################### #data1 <- features_training_outl_del new.feature.generation <- function(data1, new_feat_not_stand){ #get maxima of last 6 days into training period six_days <- data1[(NROW(data1)- 5):NROW(data1),] last_maxima <- sapply(six_days[,-1], function(a) max(a)) above_maxima <- last_maxima - new_feat_not_stand[,-1] neg_col <- which(above_maxima < 0) if(length(neg_col) != 0) { data1 <- rbind(data1, new_feat_not_stand) stand_features_training <- stand.features.training(data1, 7) cluster_data <<- stand_features_training[-NROW(stand_features_training),] new_feat <- stand_features_training[NROW(stand_features_training),] }else{ last_maxima <- sapply(last_maxima, function(a) replace(a, a==0, 0.1)) new_feat <- new_feat_not_stand[,-1]/last_maxima new_feat <- data.frame(Time=new_feat_not_stand[,1], new_feat) data1 <- rbind(data1, new_feat_not_stand) } features_training_outl_del <<- data1 new_feat } ############Function validation.daytype.known() ####################### # # # residual.outlier.test(new_day, new_feat, which_cluster, cluster_data, # # regression_data, regression_model) # # # # new_day = daily values of new day # # new_feat = features of new day # # which_cluster = result of dc.coefficient.check function # # cluster_data = result of clustering function # # regression data = data for the regression model # # regression_model = last multiple regrssion model # # # # # # Description: Validates the new day, only applicable for days that # # contain a known daytype # # # ######################################################################### validation.daytype.known <- function(new_day, new_feat, which_cluster, cluster_data, regression_data, regression_model,features_training_outl_del) { regCl_update <- 1 #necessary R commands for summary_days date_of_day <- new_day[,1] a <- regression_model$call b <- as.list(a$formula) Y <- as.character(b[[2]]) day <- format(as.POSIXct(strptime(date_of_day,"%Y.%m.%d")),"%a") measured <- new_day[,grep(Y,names(new_day))] actual_cluster <- which_cluster[1,3] #adds the day_type as last column to the new day add_day<- data.frame(new_day, day_types =as.character(which_cluster[1,3])) #add the new day to regression data regression_data <- rbind(regression_data,add_day) #add new feature to cluster_data cluster_data <- rbind(cluster_data, new_feat) #predict Y and check if outlier residuals_predicted <- predict.regression(CP, regression_data, regression_model) #necessary R commands for summary_days, result of predict.regression predicted <- fitted_noNeg[length(fitted_noNeg)] #Y <- "P_el" outliers_predict <- residual.outlier.test(residuals_predicted, regression_data, Y) #Statement True means outliers are found and stuff has to be done if(NROW(outliers_predict) != 0) { #Gives date of outlier outlier_dates <- format(outliers_predict[,1], "%Y.%m.%d") #checks if outlier date is same as new day's day_out <- which(outlier_dates %in% new_day[1,1]) #if TRUE, outlier is another day if(length(day_out) == 0) { #print("outlier isn't new day") outlier_found <- "FALSE" #Delete outlier from regression_data and cluster_data index1 <- sapply(outlier_dates, function(a) grep(a, regression_data[,1]) ) regression_data <- regression_data[-unique(index1), ] index2 <- sapply(outlier_dates, function(a) grep(a, cluster_data[,1]) ) index2<- as.integer(index2) index2<- index2[!is.na(index2)] if(length(index2) >0){cluster_data <- cluster_data[-unique(index2), ] features_training_outl_del <- features_training_outl_del[-unique(index2),]} }else{ #response <- dialog.Yes.NO("Outlier Detected", # paste(date_of_day,": Has change of operation occured?", # sep="")) #if(response == 1) #{ # summary_days <<- "FALSE" # i <<-i # break #} #no clustering and regression updates are neseccary, cuz new day is outlier index1 <- sapply(outlier_dates, function(a) grep(a,regression_data[,1]) ) regression_data <- regression_data[-c(index1,NROW(regression_data)),] index2 <- sapply(outlier_dates, function(a) grep(a,cluster_data[,1]) ) index2<- as.integer(index2) index2<- index2[!is.na(index2)] if(length(index2) >0){ cluster_data <- cluster_data[-c(index2,NROW(cluster_data)),] features_training_outl_del <- features_training_outl_del[-c(index2, NROW(features_training_outl_del)),] } outlier_found <- "TRUE" #if regression & Cluster update is set to 0, it won't take place below regCl_update <- 0 } } #update clustering and regression from TRAINING if ( regCl_update == 1) { #clusti <- cluster_data #result_features_out <- features.outlier(daily_training, clusti, # holidays,amount_outliers=1, method=2, multiplier_sd=1.9) #also available here: View(features_training_outl_del) # View(daily_training_outl_del) #cluster_data <- features_training_outl_del cluster <- clustering(cluster_data, holidays) #update day_types for regression regression_data <- add.daytypes(cluster_names, regression_data, holidays) regression_model <- multiple.regression(target_variable,regression_data) outlier_found <- "FALSE" } if(outlier_found == "TRUE") { adj.r.squared <- NA Change_point <- NA }else{ adj.r.squared <- adj.r.squared Change_point <- CP } features_training_outl_del <<- features_training_outl_del predicted_out <<- outliers_predict predicted_values <<- predicted CP <<- CP regression_model <<- regression_model cluster_data <<-cluster_data regression_data <<- regression_data summary_days <<- data.frame(date_of_day,day, actual_cluster=as.character(actual_cluster), Y=as.character(Y),measured,predicted,outlier_found, Change_point,adj.r.squared) } ############Function validation.daytype.unknown() ##################### # # # residual.outlier.test(new_day, new_feat, which_cluster, cluster_data, # # regression_data, regression_model) # # # # new_day = daily values of new day # # new_feat = features of new day # # which_cluster = result of dc.coefficient.check function # # cluster_data = result of clustering function # # regression data = data for the regression model # # regression_model = last multiple regrssion model # # # # # # Description: Validates the new day, only applicable for days that # # contain an unknown daytype # # # ######################################################################### validation.daytype.unknown <- function(new_day, new_feat, cluster_names, cluster_data, regression_data, regression_model) { date_of_day <- new_day[,1] response <- dialog.Yes.NO("Unknown Day Type",paste(date_of_day, ": Has change of operation occured?", sep="")) if(response == 1) { summary_days <<- "FALSE" i <<- i break } #necessary R commands for summary_days a <- regression_model$call b <- as.list(a$formula) Y <- as.character(b[[2]]) day <- format(as.POSIXct(strptime(date_of_day,"%Y.%m.%d")),"%a") measured <- new_day[,grep(Y,names(new_day))] day_belongs_cluster <- cluster_names[grep(day, cluster_names)] actual_cluster <- "unknown daytype" regCl_update = 1 add_day<- data.frame(new_day, day_types =as.character(day_belongs_cluster)) #add the new day to regression data regression_data <- rbind(regression_data,add_day) #add new feature to cluster_data cluster_data <- rbind(cluster_data, new_feat) #predict Y and check if outlier residuals_predicted <- predict.regression(CP, regression_data, regression_model) #necessary R commands for summary_days predicted <- fitted_noNeg[length(fitted_noNeg)] outlier_found <- "FALSE" outliers_predict <- residual.outlier.test(residuals_predicted, regression_data, Y) #Statement True means outliers are found and stuff has to be done if(NROW(outliers_predict) != 0) { #Gives date of outlier outlier_dates <- format(outliers_predict[,1], "%Y.%m.%d") #checks if outlier date is same as new day's day_out <- which(outlier_dates %in% new_day[1,1]) #if True unknown daytype day is not a outlier if(length(day_out) == 0) { #print("outlier isn't new day") outlier_found <- "FALSE" }else{ #no clustering and regression updates are neseccary, #cuz new day is outlier outlier_found <- "TRUE" regression_data <- regression_data[-NROW(regression_data),] cluster_data <- cluster_data[-NROW(cluster_data),] features_training_outl_del <- features_training_outl_del[-NROW(features_training_outl_del),] regCl_update <- 0 } } #if regression didn't find outlier, run outlier test on features # and update clustering and regression from TRAINING if ( regCl_update == 1) { #check stand_features for outliers and delete them from cluster_data #outliers_features_validation <- features.outlier(regression_data, # cluster_data, holidays) #cluster_data <- stand_features_training_outl_del #update cluster function cluster <- clustering(cluster_data, holidays) #update day_types for regression regression_data <- add.daytypes(cluster_names, regression_data, holidays) regression_model <- multiple.regression(target_variable,regression_data) } if(outlier_found == "TRUE") { adj.r.squared <- NA Change_point <- NA }else{ adj.r.squared <- adj.r.squared Change_point <- CP } predicted_out <<- outliers_predict predicted_values <<- predicted #feat_out_valid <<- outliers_features_validation CP <<- CP print(CP) regression_model <<- regression_model cluster_data <<-cluster_data regression_data <<- regression_data summary_days <<- data.frame(date_of_day,day,actual_cluster,Y, measured,predicted,outlier_found,Change_point,adj.r.squared) } ############ Function plot.regression.validation() ############# # # # plot.regression.training(data,holidays) -> # # # # data = result of validation process # # holidays = holiday vector # # # # Description: # # - plots the result of the validation # # # # # # # ######################################################################### #data <- summary_validation[25:NROW(summary_validation),] plot.regression.validation <- function(data,holidays){ #get target variable's name Y <- data[1,4] max1 <- max(data[,5]) max2 <- max(data[,6]) if(max1 > max2) {maximum <- max1 }else{ maximum <- max2 } #make vector for colouring for measured vectors vect_m <- data[,1:2] vect_m[,2] <- as.character(vect_m[,2]) list_holidays <- as.list(holidays) holidays_index <- lapply(list_holidays, function(t) match(t,vect_m[,1])) holidays_index <- as.numeric(holidays_index[!is.na(holidays_index)]) if(length(holidays_index) > 0) vect_m[holidays_index,2] <- 2 vect_m[vect_m[,2]=="Sat",2] <- 2 vect_m[vect_m[,2]=="Sun",2] <- 2 vect_m[vect_m[,2]!="2",2] <- 1 #make vector for colouring for predicted vectors vect_p <- data[,c(1,3)] vect_p[,2] <- as.character(vect_p[,2]) g <- strsplit(vect_p[,2], split=" ") index_p<-as.vector(0) for(i in 1:length(g)){ if(length(grep("Sun",g[[i]]))== 1 | length(grep("Sat",g[[i]]))==1 ) g[i] <- "red" if(length(grep("Holiday",g[[i]]))==1) g[i] <- "grey54" if(length(grep("Tue",g[[i]]))==1) g[i] <- "black" } vect_p <- data.frame(vect_p[,1], unlist(g)) #create time axis a <- as.POSIXct(strptime(data[,1], "%Y.%m.%d"), format="%Y-%m-%d") par(cex=1.3) par(cex=1.3, xpd=T) plot(a, data[,5], xaxt="n", pch=16, xlab="Time", ylab=paste("demand of ",Y," in W/m²"), col=vect_m[,2], ylim=c(0,(maximum+maximum*0.1)) ) lines(a, data[,5],xaxt="n", type="l", lwd=1.6) points(a, data[,6], xaxt="n", pch=1, col=as.character(vect_p[,2])) lines(a, data[,6], xaxt="n", type="l", lty="dotted") #outliers_subset <- subset(data[,c(1,5)], data[,7] == TRUE) #outliers_subset[,1] <- as.POSIXct(strptime(outliers_subset[,1], "%Y.%m.%d"), # format="%Y-%m-%d") #points(outliers_subset[,1], outliers_subset[,2], xaxt="n", pch=8, # col="blue", ps=4) axis.POSIXct(1, at = seq(min(a), max(a), by = "1 day"), format="%d.%m") grid(nx = NA, ny = NULL, col = "grey",lwd = par("lwd"), lty = "dotted") legend(x=min(a),y=maximum+maximum*0.37, ncol= 2,cex=0.95, x.intersp = 0.2, c("measured time series","fitted time series", "working days", "weekends", #"holidays" , "outliers manually added"), lty=c(1,2, 0,0,0), pch=c(16,1,16,16,NA,NA), col=c( "black","black","black","red","grey54", NA), box.lty=0) } #data <- summary_validation data2 <- data plot.regression.validation3daytypes <- function(data,holidays){ Y <- data[1,4] max1 <- max(data[,5]) max2 <- max(data[,6]) if(max1 > max2) {maximum <- max1 }else{ maximum <- max2 } #make vector for colouring for measured vectors vect_m <- data[,1:2] vect_m[,2] <- as.character(vect_m[,2]) list_holidays <- as.list(holidays) holidays_index <- lapply(list_holidays, function(t) match(t,vect_m[,1])) holidays_index <- as.numeric(holidays_index[!is.na(holidays_index)]) if(length(holidays_index) > 0) vect_m[holidays_index,2] <- 2 vect_m[vect_m[,2]=="Sat",2] <- "royalblue" vect_m[vect_m[,2]=="Sun",2] <- 2 vect_m[vect_m[,2]=="Mon",2] <- 1 vect_m[vect_m[,2]=="Tue",2] <- 1 vect_m[vect_m[,2]=="Wed",2] <- 1 vect_m[vect_m[,2]=="Thu",2] <- 1 vect_m[vect_m[,2]=="Fri",2] <- 1 #make vector for colouring for predicted vectors vect_p <- data[,c(1,3)] vect_p[,2] <- as.character(vect_p[,2]) g <- strsplit(vect_p[,2], split=" ") index_p<-as.vector(0) for(i in 1:length(g)){ if(length(grep("Sun",g[[i]]))== 1 | length(grep("Holiday",g[[i]]))==1 ) g[i] <- "red" if(length(grep("Sat",g[[i]]))==1 ) g[i] <- "royalblue" if(length(grep("Tue",g[[i]]))==1) g[i] <- "black" } vect_p <- data.frame(vect_p[,1], unlist(g)) #create time axis a <- as.POSIXct(strptime(data[,1], "%Y.%m.%d"), format="%Y-%m-%d") par(cex=1.3) #, xpd=TRUE) plot(a, data[,5], xaxt="n", pch=16, xlab="Time", ylab=paste("demand of ",Y," in W/m²"), col=vect_m[,2], ylim=c(0,(maximum+maximum*0.1)) ) lines(a, data[,5],xaxt="n", type="l", lwd=1.6) points(a, data[,6], xaxt="n", pch=1,col=vect_m[,2]) lines(a, data[,6], xaxt="n", type="l", lty="dotted") #outliers_subset <- subset(data[,c(1,5)], data[,7] == TRUE) #outliers_subset[,1] <- as.POSIXct(strptime(outliers_subset[,1], "%Y.%m.%d"), # format="%Y-%m-%d") #points(outliers_subset[,1], outliers_subset[,2], xaxt="n", pch=8, # col="blue", ps=4) axis.POSIXct(1, at = seq(min(a), max(a), by = "1 day"), format="%d.%m") grid(nx = NA, ny = 5, col = "grey",lwd = par("lwd"), lty = "dotted") legend(x=min(a),y=maximum+maximum*0.37, ncol= 2,cex=0.95, x.intersp = 0.2, c("measured time series","fitted time series", "working days", "sundays/holidays", "saturdays", "outliers in regression"), lty=c(1,2, 0,0,0,0), pch=c(16,1,16,16,16,NA), col=c( "black","black","black","red", 4,NA), box.lty=0) }