Abstract
This report runs the cleansing scripts that generate a set of meaningful datasets for statistical analysis, visualisation, and machine learning applications.
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]
R.Version()$version.string
## [1] "R version 3.4.4 (2018-03-15)"
# `libraries` is the character vector of package names defined in the sourced setup (not shown here).
librariesVersion <- c()
for(i in seq_along(libraries))
  librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i])))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: magrittr
## Loading required package: DT
## Loading required package: ggplot2
## Loading required package: jsonlite
## Loading required package: methods
## Loading required package: parallel
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following object is masked from 'package:base':
##
## date
timeNeeded <- (proc.time()[3] - source.starting.time);
../000.core/00.01.libraries.R completed in 1.25 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# @authors kp@eworx.gr ako@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE,
                     sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE,
                     na.strings = c("", "NULL"), verbose = FALSE){
  # Without colClasses, let fread guess the column types.
  if(length(colClasses) == 0)
    return(data.table::fread(input = getSourcePath(filename, baseFolder), header = header,
                             sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors,
                             verbose = verbose, showProgress = TRUE, na.strings = na.strings))
  return(data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses,
                           header = header, sep = sep, encoding = encoding, verbose = verbose,
                           showProgress = TRUE, na.strings = na.strings))
}
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
if(format == "rds"){saveRDS(data, getSourcePath(filename, baseFolder))}
if(format == "fst"){fst::write_fst(data, getSourcePath(filename, baseFolder))}
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
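A usage sketch with hypothetical filenames, illustrating the trade-off noted above: "rds" is compact on disk, while "fst" reloads faster for large tabular data.
dt <- data.table::data.table(id = 1:3, value = c("a", "b", "c"))
saveBinary(dt, filename = "example.rds")                  # small on disk
saveBinary(dt, filename = "example.fst", format = "fst")  # fast to reload
dtBack <- loadBinary("example.fst", format = "fst")       # returns a data.table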
rowColumns <- function(data){
return(paste( format(nrow(data), big.mark=","), "Rows X ", ncol(data), "Columns"))
}
# As the mounted storage is in memory, make sure the asset include.css is there.
publishIncludeCss <- function(){
  sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
  destinationFile <- "/data/tmpfs/results/include.css"
  if (!file.exists(destinationFile)) {
    return(file.copy(sourceFile, destinationFile))
  }else{
    return(TRUE);
  }
}
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
  for(name in names(data)){
    if(is.character(data[[name]])){
      data[[name]] <- as.factor(data[[name]])
    }
  }
  return(data)
}
############################
# https://rstudio.github.io/DT/010-style.html
# https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName %in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data, anonymize = TRUE){
# With anonymize = TRUE (the default) nothing is rendered and NULL is returned.
if(anonymize) return()
columnsName <- names(data)
#columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
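Note that with the default anonymize = TRUE the function renders nothing and returns NULL, which is why the final reportTabularData call at the end of this report prints NULL. An illustrative call on a toy data frame:
reportTabularData(head(mtcars), anonymize = FALSE)  # interactive DT table with export buttons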
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
cleanJsonId <- function(txt){
  txt <- gsub("\\.json", "", txt)  # drop the .json extension
  gsub(".*/", "", txt)             # drop the directory prefix
}
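For example, given a hypothetical path:
cleanJsonId("/data/json/cv/abc123.json")
## [1] "abc123"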
embed_data <- function(x = mtcars, filename = "file.csv", label = "Get data"){
  # Create an encoded Base64 data stream
  encode_data <- function(x){
    saveMe <- getSourcePath("file.csv")  # temporary file in the repository folder
    write.csv2(x, saveMe)
    enc <- sprintf('data:text/csv;base64,%s',
                   openssl::base64_encode(paste0(readLines(saveMe), collapse = "\n")))
    unlink(saveMe)
    return(enc)
  }
  # String result ready to be placed in R Markdown
  paste0("<a download='", filename, "' href=", encode_data(x), ">", label, "</a>")
}
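A usage sketch (illustrative dataset and label); the returned anchor can be emitted with inline R or a results='asis' chunk in R Markdown:
downloadLink <- embed_data(mtcars, filename = "mtcars.csv", label = "Download mtcars")
cat(downloadLink)  # prints the <a download=...> link with the CSV embedded as Base64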
###########################################################################################################
if(exists("libraries")){
data.table(library = libraries, version = librariesVersion)
}
includeCssPublished <- publishIncludeCss()
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel methods stats graphics grDevices utils datasets
## [8] base
##
## other attached packages:
## [1] lubridate_1.7.4 jsonlite_1.6 ggplot2_3.2.1 DT_0.10
## [5] magrittr_1.5 rmarkdown_2.1 data.table_1.12.6
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3 compiler_3.4.4 pillar_1.4.3 highr_0.8
## [5] tools_3.4.4 digest_0.6.22 evaluate_0.14 lifecycle_0.2.0
## [9] tibble_3.0.1 gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.5
## [13] yaml_2.2.0 xfun_0.11 withr_2.1.2 stringr_1.4.0
## [17] dplyr_0.8.3 knitr_1.26 htmlwidgets_1.5.1 vctrs_0.2.4
## [21] grid_3.4.4 tidyselect_0.2.5 glue_1.3.1 R6_2.4.1
## [25] purrr_0.3.3 scales_1.1.0 htmltools_0.4.0 ellipsis_0.3.0
## [29] assertthat_0.2.1 colorspace_1.4-1 stringi_1.4.3 lazyeval_0.2.2
## [33] munsell_0.5.0 crayon_1.3.4
timeNeeded <- (proc.time()[3] - source.starting.time);
../000.core/00.02.base.functions.R completed in 0.1 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
After the exploratory data analysis phase, the following fields were identified as meaningful variables describing users' computer and foreign-language skills.
The fields measured are:
fieldsMeasure <- c(
"SkillsPassport.Locale",
"Skills.Computer.Certificate.Title",
"Skills.Computer.Description",
"Skills.Computer.ProficiencyLevel.Information",
"Skills.Computer.ProficiencyLevel.Communication",
"Skills.Computer.ProficiencyLevel.ContentCreation",
"Skills.Computer.ProficiencyLevel.ProblemSolving",
"Skills.Computer.ProficiencyLevel.Safety",
"Skills.Linguistic.ForeignLanguage.Description.Code",
"Skills.Linguistic.ForeignLanguage.Description.Label",
"Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Listening",
"Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Reading",
"Skills.Linguistic.ForeignLanguage.ProficiencyLevel.SpokenProduction",
"Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Writing"
)
fieldsMeasure
## [1] "SkillsPassport.Locale"
## [2] "Skills.Computer.Certificate.Title"
## [3] "Skills.Computer.Description"
## [4] "Skills.Computer.ProficiencyLevel.Information"
## [5] "Skills.Computer.ProficiencyLevel.Communication"
## [6] "Skills.Computer.ProficiencyLevel.ContentCreation"
## [7] "Skills.Computer.ProficiencyLevel.ProblemSolving"
## [8] "Skills.Computer.ProficiencyLevel.Safety"
## [9] "Skills.Linguistic.ForeignLanguage.Description.Code"
## [10] "Skills.Linguistic.ForeignLanguage.Description.Label"
## [11] "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Listening"
## [12] "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Reading"
## [13] "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.SpokenProduction"
## [14] "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Writing"
The JSON data are parsed in batches and the relevant fields are extracted. For each batch, a tabular dataset is computed and saved in binary form.
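To make the flattening step below concrete, here is a toy sketch (hypothetical record): fromJSON followed by unlist yields a named vector whose dotted names can be matched against fieldsMeasure.
toyJson <- '{"SkillsPassport": {"Locale": "en", "LearnerInfo":
{"Skills": {"Computer": {"Description": "MS Office"}}}}}'
toyVec <- unlist(fromJSON(toyJson, flatten = TRUE), recursive = TRUE)
names(toyVec)
## [1] "SkillsPassport.Locale"
## [2] "SkillsPassport.LearnerInfo.Skills.Computer.Description"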
chunkRepo <- getSourcePath("jobsOutput/skillsDesChunks/")
# Clear any chunk files left over from previous runs.
unlink(list.files(chunkRepo, full.names = TRUE), recursive = TRUE)
filesToAnalyseData <- readRDS(file = getSourcePath("jobsOutput/jsonFilesToAnalyse.rds"))
filesToAnalyse <- filesToAnalyseData$jsonFile
json_files_split <- split(filesToAnalyse, 1:20)          # round-robin split into 20 batches
v_grepl <- Vectorize(grepl, vectorize.args = "pattern")  # grepl vectorised over patterns
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores, type = "FORK")
deafen <- parLapply(cl, json_files_split, function(json_chunk){
  json_DT <- lapply(json_chunk, function(x){
    jsonList <- fromJSON(x, flatten = TRUE)
    jsonVec <- unlist(jsonList, recursive = TRUE)
    # Keep only the entries whose flattened names match one of the measured fields.
    existFields <- v_grepl(fieldsMeasure, names(jsonVec))
    existFields <- apply(existFields, 1, function(x) sum(x) != 0)
    resultVec <- jsonVec[existFields]
    if(length(resultVec) == 0) return(NULL)
    # One-row data.table with the matched fields as columns.
    result <- resultVec %>% data.table %>% transpose
    if(nrow(result) == 0) return(NULL)
    setnames(result, names(resultVec))
    result[ , id := x]
  }) %>% rbindlist(fill = TRUE)
  # Save each batch under a time-based unique name.
  uniqueNum <- as.numeric(Sys.time())
  saveRDS(object = json_DT, file = paste0(chunkRepo, "computer", uniqueNum, ".rds"))
})
stopCluster(cl)
#binding chunks
dataChunks <- list.files(path = chunkRepo, full.names = TRUE)
mergeChunksDT <- lapply(dataChunks, function(x){
readRDS(x)
}) %>% rbindlist(fill = TRUE, use.names = TRUE)
missingPerCol <- sapply(mergeChunksDT, function(x){
  sumNa <- sum(is.na(x))
  sumNa / length(x)
})
# Drop columns that are missing in essentially all records (>= 99.9% NA).
mergeChunksDT <- mergeChunksDT[, missingPerCol < 0.999, with = FALSE]
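As a toy illustration of the filter above: a column that is NA everywhere falls over the 99.9% threshold and is dropped.
toyDT <- data.table(a = c(1, NA, NA), b = c(NA, NA, NA))
keep <- sapply(toyDT, function(x) mean(is.na(x))) < 0.999
toyDT[, keep, with = FALSE]  # column b (100% NA) is dropped, column a is kept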
setnames(mergeChunksDT, "SkillsPassport.Locale", "locale")
## Computer Skills Data
computerDT <- mergeChunksDT[, !grepl("ForeignLanguage", names(mergeChunksDT)), with = FALSE]
meltComputer <- melt(computerDT, id.vars = c("id", "locale"), na.rm = TRUE)
meltComputer[, variable := gsub("SkillsPassport.LearnerInfo.Skills.Computer.", "", variable)]
meltComputer[, id := cleanJsonId(id)]
## Foreign Language Data
foreignDT <- mergeChunksDT[, !grepl("Computer", names(mergeChunksDT)), with = FALSE]
meltForeign <- melt(foreignDT, id.vars = c("id", "locale"), na.rm = TRUE)
meltForeign[, variable := gsub("SkillsPassport.LearnerInfo.Skills.Linguistic.ForeignLanguage.", "", variable)]
meltForeign[, id := cleanJsonId(id)]
meltForeign[, num := gsub("[^\\d]+", "", variable, perl=TRUE)]  # keep only the entry index digits
meltForeign[, variable := gsub("[[:digit:]]", "", variable)]    # strip the index from the field name
foreignData <- dcast(meltForeign, id+locale+num ~ variable)[, num := NULL]
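A minimal sketch with toy values of the reshape above: the numbered measurement columns are melted, the entry index is split off into num, and dcast pivots back to one row per language entry.
toy <- data.table(id = "cv1", locale = "en",
                  Description.Code1 = "en", Description.Code2 = "de",
                  ProficiencyLevel.Listening1 = "B2", ProficiencyLevel.Listening2 = "A1")
toyMelt <- melt(toy, id.vars = c("id", "locale"), na.rm = TRUE)
toyMelt[, num := gsub("[^\\d]+", "", variable, perl = TRUE)]
toyMelt[, variable := gsub("[[:digit:]]", "", variable)]
dcast(toyMelt, id + locale + num ~ variable)[, num := NULL][]  # two rows: one per language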
data <- meltComputer[!duplicated(meltComputer)]
saveRDS(data, getSourcePath("jobsOutput/skillsComputerDT.rds"))
summariseTable(data)
data <- foreignData[!duplicated(foreignData)]
saveRDS(data, getSourcePath("jobsOutput/skillsForeignLanguageDT.rds"))
summariseTable(data)
timeNeeded <- (proc.time()[3] - source.starting.time);
01.computer.R completed in 2883.41 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.blocks$sourceTimeNeeded <- sourceTimeNeeded;
Completed in 2884.75 seconds.
reportTabularData(source.blocks);
## NULL