In the following you will find a simple data mining demo that shows modelling and forecasting in the connection of R and Exasol. If you have not performed the installation and configuration of the necessary components yet, you can find an instruction here.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# Laden der benötigten Pakete library(RODBC) library(exasol) library(rredis) library(magrittr) library(stringi) library(rpart) library(partykit) # Verbindung zu Exasol und Redis herstellen con <- odbcConnect("exasol_vm") redisConnect("172.20.248.13") # Erstellen eines Zufallszahlen-Vektors für die Stichprobenziehung rnd <- rnorm(nrow(iris)) # Anfügen der Gruppenvariable (Training/Validierung) iris$groups <- factor(NA, levels = c("Train", "Valid")) # Nach Spezies geschichtete Zufallsziehung: 70% Training, 30% Validierung for(i in unique(iris$Species)) { logVec <- iris$Species == i iris$groups[logVec] <- ifelse(test = rnd[logVec] > quantile(rnd[logVec], probs = 0.3), yes = "Train", no = "Valid") } # Überprüfung der Stichprobenziehung table(iris$groups, iris$Species) # Workspace aufräumen rm(rnd, logVec, i) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
# Datenbankschema mit dem Namen <em>my_schema</em> erstellen odbcQuery(con, "create schema my_schema") # Erstellen der leeren Tabelle unter dem Namen <em>irisdb</em> odbcQuery(con, "create or replace table my_schema.irisdb( SepalLength DOUBLE, SepalWidth DOUBLE, PetalLength DOUBLE, PetalWidth DOUBLE, Species CHAR(20), Groups CHAR(20));") # Hochladen der <em>iris</em> Daten in die Exasol exa.writeData(con, data = iris, tableName = "my_schema.irisdb") |
1 2 3 4 5 6 7 8 9 10 11 12 13 |
# Erstellen eines Trainings- und eines Validierungsdatensatzes train <- subset(iris, subset = groups == "Train", select = -groups) valid <- subset(iris, subset = groups == "Valid", select = -groups) # Erstellen des Entscheidungsbaums mit den Trainingsdaten localTree <- rpart(Species ~ ., data = train) # Visualisierung des Baums plot(as.party(localTree)) # Prognose Validierungsdaten mit Hilfe des Baums pred <- predict(localTree, type = "class", newdata = valid) # Prognose überprüfen table(pred, valid$Species, dnn = c("Vorhersage", "Tatsächlich")) %&gt;% addmargins() |
The model built in the Exasol cluster will be stored in Redis, a key-value database. With Redis you can distribute models, functions and other R objects in the cluster and load them from the cluster.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
exa_rf <- exa.createScript( con, "my_schmea.exa_rf", # Unter diesem Namen ist das R-Script über SQL verfügbar function(data) { # Laden der benötigten Pakete. Diese müssen ggf. in der Exasol installiert sein require(rpart) require(stringi) require(rredis) # Verbindung mit Redis redisConnect("172.20.248.13", port = 6379) # Laden aller Daten aus der Exasol Tabelle # wird im Funktionsaufruf das <em>groupBy</em> oder <em>where</em> Argument verwendet, # wird nur der entsprechende Teil der Daten geladen. data$next_row(NA) # Wandeln des <em>data</em> Objekts in einen <em>data.frame</em> df <- data.frame(v1 = data$SepalLength, v2 = data$SepalWidth, v3 = data$PetalLength, v4 = data$PetalWidth, species = data$Species) # Aufbereiten des data.frames df$species <- stri_replace_all_fixed(df$species, " ", "") df$species <- as.factor(df$species) # Erstellen des Baums rf <- rpart(species ~ ., data = df) # Speichern des Baums in Redis redisSet("exa_rf", rf) # Rückgabe der Zeilenanzahl (zur Kontrolle) data$emit(nrow(df)) }, inArgs = c("SepalLength DOUBLE", "SepalWidth DOUBLE", "PetalLength DOUBLE", "PetalWidth DOUBLE", "Species CHAR(20)"), outArgs = "Feedback INT") # Aufrufen der oben gebildeten Funktion. Das where Argument legt fest, dass # das Modell auf den Trainingsdaten gebildet wird. exa_rf("SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species", table = "my_schema.irisdb", where = "groups = 'Train'") |
The tree model can be used for forecasting in a separate step on the Exasol. The first part is mostly identical with the above function.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
exa_predict_rf <- exa.createScript( con, "my_schema.exa_pred", function(data) { require(rpart) require(rredis) redisConnect("172.20.248.13", port = 6379) data$next_row(NA) df <- data.frame(v1 = data$SepalLength, v2 = data$SepalWidth, v3 = data$PetalLength, v4 = data$PetalWidth, species = data$Species) # Laden des Baum Modells aus Redis rf <- redisGet("exa_rf") # Erstellen der Prognose pred <- predict(rf, newdata = df, type = "class") # Rückgabe der Prognose sowie der echten Klassenzugehörigkeit data$emit(pred, df$species) }, inArgs = c( "SepalLength DOUBLE", "SepalWidth DOUBLE", "PetalLength DOUBLE", "PetalWidth DOUBLE", "Species CHAR(20)"), outArgs = c("Prognose CHAR(20)", "Realwerte CHAR(20)")) # Aufruf der oben gebildeten Funktion. Die Rückgabe wird in ein Objekt gespeichert. exa_pred <- exa_predict_rf("SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species", table = "my_schema.irisdb", where = "groups = 'Valid'") # Mit Hilfe der Table Funktion kann geprüft werden, wie gut die Prognose performt. table(exa_pred$PROGNOSE, exa_pred$REALWERTE, dnn = c("Prognose", "Realwerte")) %&gt;% addmargins() |
In the following you will find a simple data mining demo that shows modelling and forecasting in the connection of R and Exasol. If you have not performed the installation and configuration of the necessary components yet, you can find an instruction here.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# Load the required packages library(RODBC) library(exasol) library(rredis) library(magrittr) library(stringi) library(rpart) library(partykit) # Connection to Exasol and Redis con <- odbcConnect("exasol_vm") redisConnect("172.20.248.13") # Random numbers vector for the sampling rnd <- rnorm(nrow(iris)) # Add the group variable (Training/Validierung) iris$groups <- factor(NA, levels = c("Train", "Valid")) # Random draw: 70% Training, 30% Validation for(i in unique(iris$Species)) { logVec <- iris$Species == i iris$groups[logVec] <- ifelse(test = rnd[logVec] > quantile(rnd[logVec], probs = 0.3), yes = "Train", no = "Valid") } # Review of the sampling table(iris$groups, iris$Species) # Clean up the Workspace rm(rnd, logVec, i) |
The iris data are now transferred to the Exasol. At first use, a database schema and an empty table are created. Afterwards, the iris data are written into the table. Once the data are uploaded they can be used time and again.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
# Create a Database scheme with the name <em>my_schema</em> odbcQuery(con, "create schema my_schema") # Create an empty table with the name <em>irisdb</em> odbcQuery(con, "create or replace table my_schema.irisdb( SepalLength DOUBLE, SepalWidth DOUBLE, PetalLength DOUBLE, PetalWidth DOUBLE, Species CHAR(20), Groups CHAR(20));") # Upload of the <em>iris</em> Data in the Exasol exa.writeData(con, data = iris, tableName = "my_schema.irisdb") |
In the first step of the analysis a decision tree is created locally. Based on this tree a forecast is done.
1 2 3 4 5 6 7 8 9 10 11 12 13 |
# Creating two data sets train <- subset(iris, subset = groups == "Train", select = -groups) valid <- subset(iris, subset = groups == "Valid", select = -groups) # Decission tree localTree <- rpart(Species ~ ., data = train) # Visualisation of the tree plot(as.party(localTree)) # Forecast pred <- predict(localTree, type = "class", newdata = valid) # Check forecast table(pred, valid$Species, dnn = c("Vorhersage", "Tatsächlich")) %&gt;% addmargins() |
The same procedure as above now in the Exasol. The exa.script function creates an R script on the Exasol server. The function call runs the script on the Exasol clusters. This means that the analysis no longer takes place on the local R but on the R instances in the Exasol cluster. Packages which are used on the R instances on the Exasol need to be installed there. Have a look at this blog entry for more information.
The model built in the Exasol cluster will be stored in Redis, a key-value database. With Redis you can distribute models, functions and other R objects in the cluster and load them from the cluster.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
exa_rf <- exa.createScript( con, "my_schmea.exa_rf", # Unter diesem Namen ist das R-Script über SQL verfügbar function(data) { # Load the required packages require(rpart) require(stringi) require(rredis) # Connection with Redis redisConnect("172.20.248.13", port = 6379) # Loading all data from the Exasol table data$next_row(NA) # Transform the <em>data</em> Object into <em>data.frame</em> df <- data.frame(v1 = data$SepalLength, v2 = data$SepalWidth, v3 = data$PetalLength, v4 = data$PetalWidth, species = data$Species) # Prepare the data.frames df$species <- stri_replace_all_fixed(df$species, " ", "") df$species <- as.factor(df$species) # Generate a tree rf <- rpart(species ~ ., data = df) # Save the tree in Redis redisSet("exa_rf", rf) # Return the line number data$emit(nrow(df)) }, inArgs = c("SepalLength DOUBLE", "SepalWidth DOUBLE", "PetalLength DOUBLE", "PetalWidth DOUBLE", "Species CHAR(20)"), outArgs = "Feedback INT") # Call of the function. exa_rf("SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species", table = "my_schema.irisdb", where = "groups = 'Train'") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
exa_predict_rf <- exa.createScript( con, "my_schema.exa_pred", function(data) { require(rpart) require(rredis) redisConnect("172.20.248.13", port = 6379) data$next_row(NA) df <- data.frame(v1 = data$SepalLength, v2 = data$SepalWidth, v3 = data$PetalLength, v4 = data$PetalWidth, species = data$Species) # Loading the tree model out of Redis rf <- redisGet("exa_rf") # Creating a Forecast pred <- predict(rf, newdata = df, type = "class") # Return of the forecast data$emit(pred, df$species) }, inArgs = c( "SepalLength DOUBLE", "SepalWidth DOUBLE", "PetalLength DOUBLE", "PetalWidth DOUBLE", "Species CHAR(20)"), outArgs = c("Prognose CHAR(20)", "Realwerte CHAR(20)")) # Call of the function. The return is stored in an object. exa_pred <- exa_predict_rf("SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species", table = "my_schema.irisdb", where = "groups = 'Valid'") # Performance testing table(exa_pred$PROGNOSE, exa_pred$REALWERTE, dnn = c("Prognose", "Realwerte")) %&gt;% addmargins() |
eoda GmbH - Posted on 20.06.2016
Als Data Science Spezialisten sind wir Ihr Ansprechpartner im Umfeld von Big Data, Machine Learning und Künstlicher Intelligenz. Wir unterstützen Sie ganzheitlich – von der Identifikation des richtigen Anwendungsfalls über die Datenanalyse und Interpretation der Ergebnisse bis hin zur Implementierung der entwickelten Lösung in Ihr Produktivsystem.