# Columns for data have to be named as follows, their sequence is optional, # except for "Time" which has to be in the first column # - Q_gas for gas demand in kWh # - Q_el for electricity demand in kWh # - gas for gas consumption in m³ # - H2O for water consumption in m³ # - T_i for indoor air temperature of a heated (winter) and cooled (summer) room # - T_i_heat for indoor air temperature of room that is heated only # - T_i_cool for indoor air temperature of room that is cooled only # - T_a ambient for outdoor air temperature # - Time for timestamp of day, either showing 15 min or 1 hour measurements, # format has to be "yyyy.mm.dd hh:mm" e.g "2008.04.26 00:15" # this has to be provided in the first column # ################################################################################## # # START OF TRAINING # # To be entered by user before skript is processed by R: # 1. target_variable Y for regression, Y = beta0 + beta1*X1...beta_n*Xn, # so far either "gas" or "electricity" or "district_heat" # 2. number_of_features wanted, 3(ave), 6(ave, max) or 9(ave, max, min) # 3. Time frame of data that should be compiled to daily values # in format "yyyy.mm.dd hh:mm" e.g "2008.04.24 00:15" # 4. list of public holidays # 5. the table containing all_data # 6. the net_area of the buidling # 7. energy which are used for cooling (dep_c) and heating (dep_h), if not applicable # put "NA" ################################################################################## rm(list=ls()) #source("D:\\skomhard\\Masterarbeit\\R_algorithm\\R\\R_funct_patchwork2.R") source("R_funct_patchwork2.R") #Target variable for regression target_variable <- "gas" #Großpösna #target_variable <- "water" #target_variable <- "district_heat" #thyssen #target_variable <- "electricity" #Numbers of features for daytyping numbers_of_features <- 6 #Training I period from_date <- "2008.02.17 00:00" #Großpösna to_date <- "2008.05.21 23:45" #Großpösna #from_date <- "2008.04.05 00:00" #Thyssen #to_date <- "2008.07.05 23:00" #Thyssen #Vector containing known holidays in Training period holidays <- c("2007.11.21","2007.12.25","2007.12.26","2008.01.01","2008.03.21","2008.03.23", "2008.03.24","2008.05.01","2008.05.11","2008.05.12","2008.05.22","2008.10.03","2008.10.31") # all_data <- read.csv(file.choose(), sep=";", header=T) all_data <- read.csv("Poesna_DP_071122_080809.csv", sep=";", header=T) all_data[5857:5952, 5] <- all_data[5857:5952, 5]*0.2+all_data[5857:5952, 5] all_data[6241:6336, 5] <- all_data[6145:6240, 5] #all_data[9530:9540, 4] <- 8 #all_data <- all_data[,-3] #Thyssen selected_data <- all_data[c(grep(from_date,as.character(all_data[,1])):grep(to_date, as.character(all_data[,1]))),] #create outlier for validation for pgas on 20.5 #selected_data[3293,3 ] <- 5 #selected_data[11869:11879, 4] <- 0.01 #for carpetplot, optional #d.x <- format(strptime(selected_data[,1],"%Y.%m.%d %H:%M"),"%Y.%m.%d %H") #hourly_selected_T <- aggregate(selected_data[,c(2,6,7)], by=list(d.x), FUN = mean) #hourly_selected_C <- aggregate(selected_data[,c(3,4,5)], by=list(d.x), FUN = sum) #hourly_selected <- cbind(hourly_selected_T,hourly_selected_C[,-1]) #hourly_selected[,1] <- format(strptime(hourly_selected[,1], "%Y.%m.%d %H"), "%Y.%m.%d %H:%M") #carpet.plot("2008.04.05 00:00" , "2008.08.09 23:00" , hourly_selected, c(5,6)) #net floor area of building net_area <- 436 #großpösna #net_area <- 19500 #Thyssen ################################################################################## # # Compile data to daily values # # Function used: compile_data(data, area) # # data: data table as described above, e.g. selected_data # area: net floor area of building in m², e.g. 740 # # to be specified by user outside of function: NA ################################################################################## #qnorm(1-0.1, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) #pnorm(3.2, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) #shapiro.test(dats) #as 0's cause numerical problems within the clustering, all 0 values are increased #by 1% of the difference between 0 and the next biggest value #selected_data <- increase.zeros(selected_data) #Fehler einbauen in selected data #selected_data[11943,5] <- 2 #selected_data[11947,5] <- 200 #selected_data[11948,5] <- 300 #selected_data[11949,5] <- 400 #selected_data[11940,5] <- 20 #selected_data[11942,5] <- 20 #selected_data[11941,5] <- 40 daily_all <- compile.data(all_data, net_area) daily_training <- compile.data(selected_data, net_area) #hourly_training <- selected_data #hourly_training is one day too long cuz of diff() in compile.data #Error for p_gas 30.5 and 12.6 von 0 auf 10W/m2 #daily_data[131,5] <- 10 #daily_data[144,5] <- 10 ################################################################################## # # test for outliers in variables # # Function used: outlier(data) # # data: data table resulting from calling compile.data # # to be specified by user outside of function: # - list of names of variables tested for outliers, e.g: # outlier_on <- c("T_a", "T_i", "T_i_heat", "T_i_cool") ################################################################################## #example for outlier.result #daily_training[50,2] <- 400 #daily_training[100,2] <- -1000 #daily_training[80,2] <- 700 #daily_training[70,6] <- -400 #daily_training[40,6] <- 300 #selecting which variables to be tested for outliers outlier_on <- c("T_a", "T_i", "T_i_heat", "T_i_cool") pos_outlier <-match(outlier_on, names(daily_training)) column_pos <-pos_outlier[!is.na(pos_outlier)] #testing variables for outliers and replacing them with NA result_outlier <- data.frame(date=character(0), value=character(0), type=character(0)) for(i in 1:length(column_pos)) { single_outlier <- single.outlier(daily_training[,c(1,column_pos[i])], 2, 3) result_outlier <- rbind(result_outlier, single_outlier)} if(NROW(result_outlier)>0) { #View(result_outlier) day <- strptime(daily_training[,1], "%Y.%m.%d") for(j in 1:NROW(result_outlier)){ row <- grep(result_outlier[j,1], day) column <- match(result_outlier[j,3], names(daily_training)) #daily_training[row,column] <- as.character("NA") daily_training[row,column] <- NA #daily_training[,column] <- as.numeric(daily_training[,column]) } }else{ "no outliers detected on single variables" } #for test reasons #write.table(summary_validation, "D:\\skomhard\\Masterarbeit\\R_algorithm\\R\\sumi1.R", # sep=";",dec=".", col.names=TRUE, row.names=FALSE) ################################################################################## # # Create 3 features (ave for heat, electricity, water) # # Function used: feature.vector3(data) # # data: data table resulting from calling compile.data # # to be specified by user outside of function: NA ################################################################################## #position of T_a T_a_pos <- match("T_a", names(daily_training)) #potential columns to generate features feature_consum <- c("Q_gas", "Q_el", "gas", "H2O", "Q_dh") #feature_consum <- c("H2O") pos_consum <-match(feature_consum, names(all_data)) column_pos <-pos_consum[!is.na(pos_consum)] #only consumption part can go into features that is not weather dependant #cor_T <- cor(daily_training[,column_pos],daily_training[,T_a_pos]) #out_rows <- which(abs(cor_T) > 0.5) #out_names <- row.names(cor_T)[2] #col_index <- match(out_names, names(daily_training)) #if(is.na(col_index) == FALSE){ col_select <- column_pos[-which(column_pos %in% col_index)] # }else{ # col_select <- column_pos # } feature_data_hourly <- all_data[,c(1,column_pos)] #Fehler rein #feature_data_hourly[11273,2] <- 12 features_all <- feature.vector6(feature_data_hourly) end_training_day <- strsplit(to_date, split=" ") end_training_day <- end_training_day[[1]][1] start_training_day <- strsplit(from_date, split=" ") start_training_day <- start_training_day[[1]][1] index_training_end <- grep(end_training_day, features_all[,1]) index_training_start <- grep(start_training_day, features_all[,1]) features_training <- features_all[(index_training_start+1):index_training_end,] ################################################################################## # # outlier test on transformed features and standardisation # # Function used: features.outlier(data), # stand.features.training(data, moving) # ################################################################################## #outlier test on features result_features_out <- features.outlier(daily_training, features_training, holidays, amount_outliers=1, method=2, multiplier_sd=3) #also available here: View(features_training_outl_del) # View(daily_training_outl_del) #daily_training1 <- daily_training_outl_del stand_features_training <- stand.features.training(features_training_outl_del, 7) #outlier test on standardised features result_features_out1 <- features.outlier(daily_training_outl_del,stand_features_training, holidays, amount_outliers=2, method=1, multiplier_sd=2.5) result_features_out <- rbind(result_features_out, result_features_out1) #result_features_out <- result_features_out1 stand_features_training <- features_training_outl_del #stand_features_training <- stand_features_training[-1,] #also available here is last_maxima ####################################################################################### # create cluster and make day_type column # Function used: clustering(data) # ####################################################################################### cluster <- clustering(stand_features_training, holidays) #last column of cluster_table gives list of day_types for each day day_types <- cluster_table[,ncol(cluster_table)] regression_data <- add.daytypes(cluster_names,daily_training_outl_del,holidays) #regression_data <- cbind(daily_training_outl_del, day_types) cluster_data <- stand_features_training ####################################################################################### # run regression and make optimized model # Function used: multiple.regression(data) # ####################################################################################### regression_model <- multiple.regression(target_variable,regression_data) #analysis of regression model, downweigh bad points if any are left #table_regression <- influence.measures(regression_model) #matrix_regression <- table_regression$infmat #influential_matrix <- table_regression$is.inf #points <- which(apply(table_regression$is.inf, 1,any)) #plot.regression.training(CP, regression_model, daily_training, # result_features_out, cluster_names) ####################################################################################### # # END OF TRAINING # # START OF VALIDATION 1 # ####################################################################################### #data table with daily values and outliers: daily_training #data table with standardised features and outliers: stand_features_training #data table with daily values no outliers: daily_training_outl_del #data table with standardised features no outliers: stand_features_training_outl_del #data table with all available hourly/15min data: all_data #data table with all available compiled daily: daily_all #data table with all standardized features: stand_features_all #end_training_day #date of last day of training, daily summary_validation <- data.frame(date_of_day=character(0), day=character(0), actual_cluster=character(0), Y=character(0), measured=numeric(0), predicted=numeric(0), outlier_found=logical(0), Change_point=numeric(0),adj.r.squared=numeric(0) ) all_outlier <- data.frame(date=character(0), value_predicted =numeric(0), type=character(0), class=character(0)) new_features <- data.frame(Time=character(0), numeric(0),numeric(0),numeric(0),numeric(0),numeric(0),numeric(0)) clusters_found <- data.frame(Date=character(0), day=character(0), cluster=character(0)) #create date series if Validation is before training, also standardize all features day_series <- daily_all[1:(index_training_start-1),1] stand_features_all<- stand.features.training(features_all, 7) #create date series for validation #index_valid_start <- grep(end_training_day, daily_all[,1]) #day_series <- daily_all[(index_valid_start+1):NROW(daily_all),1] for(i in 23:87) #for(i in 1:NROW(day_series)) { new_day <- daily_all[grep(day_series[i],daily_all[,1]), ] new_feat <- stand_features_all[grep(day_series[i],stand_features_all[,1]), ] #new_feat <- new.feature.generation(features_training_outl_del, new_feat_not_stand,holidays) which_cluster <- dc.coefficient.check(new_feat, cluster_names, cluster_data) clusters_found <- rbind(clusters_found, which_cluster) result_validation <- validation.daytype.known(new_day, new_feat, which_cluster, cluster_data,regression_data, regression_model, features_training_outl_del) summary_validation <- rbind(summary_validation, result_validation) } if(which_cluster[1,3] != "unknown daytype") { #run validation for known daytype result_validation <- validation.daytype.known(new_day, new_feat, which_cluster, cluster_data,regression_data, regression_model, features_training_outl_del) #if(summary_days == "FALSE") break }else{ result_validation <- validation.daytype.unknown(new_day, new_feat, cluster_names, cluster_data, regression_data, regression_model) } summary_validation <- rbind(summary_validation, result_validation) #if(summary_days == "FALSE") break } # plot.regression.validation(summary_validation[25:87,],holidays) plot.regression.validation(summary_validation[1:24,],holidays)