sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]

Environment

R version

R.Version()$version.string 
## [1] "R version 3.4.4 (2018-03-15)"

Libraries initialisation

librariesVersion <- c()

for(i in seq_along(libraries))
    librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i])))

librariesLoaded <- lapply(libraries, require, character.only = TRUE) 
## Loading required package: magrittr
## Loading required package: DT
## Loading required package: ggplot2
## Loading required package: jsonlite
## Loading required package: methods
## Loading required package: parallel
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following object is masked from 'package:base':
## 
##     date
timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.01.libraries.R completed in 1.25 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]


## Base functions

# @authors kp@eworx.gr ako@eworx.gr
repository <- "/data/generic/"

getSourcePath <- function(filename, baseFolder = repository){
    return(paste(baseFolder, filename, sep = ""))
}

readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
    if(length(colClasses) == 0)
        return(data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings))
    return(data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings))
}

# rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
  if(format == "rds"){saveRDS(data, getSourcePath(filename, baseFolder))}
  if(format == "fst"){fst::write_fst(data, getSourcePath(filename, baseFolder))}
}

#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
  if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
  if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
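# Quick round trip with the helpers above (a sketch; "demo.rds" is a
# hypothetical file name under `repository`):
demoData <- data.frame(x = 1:3, y = letters[1:3])
saveBinary(demoData, filename = "demo.rds", format = "rds")
reloaded <- loadBinary("demo.rds", format = "rds")
all.equal(demoData, reloaded)      # TRUE
unlink(getSourcePath("demo.rds"))  # tidy up the demo file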

rowColumns <- function(data){
    return(paste(format(nrow(data), big.mark = ","), "Rows X", ncol(data), "Columns"))
}

# As the mounted storage is in memory, make sure the asset include.css is there.
publishIncludeCss <- function(){
    sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
    destinationFile <- "/data/tmpfs/results/include.css"
    if (!file.exists(destinationFile)) {
        return(file.copy(sourceFile, destinationFile))
    }else{
        return(TRUE)
    }
}

summariseTable <- function(data){
    return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
    #return(do.call(cbind, lapply(data, summary)))
}

factoriseCharacterColumns <- function(data){
    for(name in names(data)){
        if(is.character(data[[name]])){
            data[[name]] <- as.factor(data[[name]])
        }
    }
    return(data)
}
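# Example: character columns become factors while other types stay untouched.
exampleDF <- data.frame(a = c("x", "y"), b = 1:2, stringsAsFactors = FALSE)
sapply(factoriseCharacterColumns(exampleDF), class)  # a: "factor", b: "integer"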

############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown

capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))


styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){

    if(columnName %in% columnsName_original){
        result <- result %>%   formatStyle(
            columnName,
            background = styleColorBar(data[[columnName]], color),
            backgroundSize = '100% 90%',
            backgroundRepeat = 'no-repeat',
            backgroundPosition = 'center'
        )
    }
    return(result)

}


reportTabularData <- function(data, anonymize = TRUE){

    # With anonymize = TRUE no table is rendered and NULL is returned.
    if(anonymize) return()

    columnsName <- names(data)
    #columnsName <- lapply(columnsName, capitalise)
    columnsName_original <- names(data)

    result <-
        DT::datatable(
            data,
            class = 'cell-border stripe',
            filter = 'top',
            rownames = FALSE,
            colnames = columnsName,
            extensions = 'Buttons',
            options = list(
                pageLength = 20,
                columnDefs = list(list(className = 'dt-left', targets = "_all")),
                dom = 'Bfrtip',
                buttons = c('copy', 'csv', 'excel', 'pdf'),
                searchHighlight = TRUE,
                initComplete = JS(
                    "function(settings, json) {",
                        "$(this.api().table().header()).css({'border': '1px solid'});",
                    "}"
                )
            )

        )


    result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
    result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
    result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
    result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)

    return(result)
}

fonts <- list(
    sans = "DejaVu Serif",
    mono = "DejaVu Serif",
    `Times New Roman` = "DejaVu Serif"
)

cleanJsonId <- function(txt){
    # Strip the .json extension and any leading path, keeping only the bare id.
    txt <- gsub("\\.json", "", txt)
    gsub(".*/", "", txt)
}
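# For instance, a hypothetical export path reduces to its bare id:
cleanJsonId("/data/cvs/batch01/12345.json")  # "12345"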

embed_data <- function(x = mtcars, filename = "file.csv", label = "Get data"){

  # Create Base64-encoded data stream
  encode_data <- function(x){
    saveMe <- tempfile(fileext = ".csv")  # scratch file, removed right after encoding
    write.csv2(x, saveMe)
    enc <- sprintf('data:text/csv;base64,%s', openssl::base64_encode(paste0(readLines(saveMe), collapse = "\n")))
    unlink(saveMe)
    return(enc)
  }

  # String result ready to be placed in rmarkdown
  paste0("<a download='", filename, "' href='", encode_data(x), "'>", label, "</a>")

}
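# Example usage (a sketch; mtcars stands in for any data.frame). The returned
# HTML string can be printed from an R Markdown chunk with results = "asis":
# cat(embed_data(mtcars, filename = "mtcars.csv", label = "Get mtcars data"))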

###########################################################################################################

Libraries version

if(exists("libraries")){
    data.table(library = libraries, version = librariesVersion)
}
includeCssPublished <- publishIncludeCss()

Session info

sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  methods   stats     graphics  grDevices utils     datasets 
## [8] base     
## 
## other attached packages:
## [1] lubridate_1.7.4   jsonlite_1.6      ggplot2_3.2.1     DT_0.10          
## [5] magrittr_1.5      rmarkdown_2.1     data.table_1.12.6
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3        compiler_3.4.4    pillar_1.4.3      highr_0.8        
##  [5] tools_3.4.4       digest_0.6.22     evaluate_0.14     lifecycle_0.2.0  
##  [9] tibble_3.0.1      gtable_0.3.0      pkgconfig_2.0.3   rlang_0.4.5      
## [13] yaml_2.2.0        xfun_0.11         withr_2.1.2       stringr_1.4.0    
## [17] dplyr_0.8.3       knitr_1.26        htmlwidgets_1.5.1 vctrs_0.2.4      
## [21] grid_3.4.4        tidyselect_0.2.5  glue_1.3.1        R6_2.4.1         
## [25] purrr_0.3.3       scales_1.1.0      htmltools_0.4.0   ellipsis_0.3.0   
## [29] assertthat_0.2.1  colorspace_1.4-1  stringi_1.4.3     lazyeval_0.2.2   
## [33] munsell_0.5.0     crayon_1.3.4
timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.02.base.functions.R completed in 0.1 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Computer and Language data set

After the exploratory data analysis phase, the following fields were identified as meaningful variables describing the computer and foreign language skills of users.

The following fields are measured:

fieldsMeasure <- c(
    "SkillsPassport.Locale", 
    "Skills.Computer.Certificate.Title",
    "Skills.Computer.Description",
    "Skills.Computer.ProficiencyLevel.Information",
    "Skills.Computer.ProficiencyLevel.Communication",
    "Skills.Computer.ProficiencyLevel.ContentCreation",
    "Skills.Computer.ProficiencyLevel.ProblemSolving",
    "Skills.Computer.ProficiencyLevel.Safety",
    "Skills.Linguistic.ForeignLanguage.Description.Code",
    "Skills.Linguistic.ForeignLanguage.Description.Label",
    "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Listening",
    "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Reading",
    "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.SpokenProduction",
    "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Writing"
)

fieldsMeasure
##  [1] "SkillsPassport.Locale"                                              
##  [2] "Skills.Computer.Certificate.Title"                                  
##  [3] "Skills.Computer.Description"                                        
##  [4] "Skills.Computer.ProficiencyLevel.Information"                       
##  [5] "Skills.Computer.ProficiencyLevel.Communication"                     
##  [6] "Skills.Computer.ProficiencyLevel.ContentCreation"                   
##  [7] "Skills.Computer.ProficiencyLevel.ProblemSolving"                    
##  [8] "Skills.Computer.ProficiencyLevel.Safety"                            
##  [9] "Skills.Linguistic.ForeignLanguage.Description.Code"                 
## [10] "Skills.Linguistic.ForeignLanguage.Description.Label"                
## [11] "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Listening"       
## [12] "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Reading"         
## [13] "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.SpokenProduction"
## [14] "Skills.Linguistic.ForeignLanguage.ProficiencyLevel.Writing"
  • Read unique CVs

The JSON data are parsed in batches and the relevant fields are extracted. For each batch a tabular dataset is computed and saved in binary form.

chunkRepo <- getSourcePath("jobsOutput/skillsDesChunks/")
unlink(list.files(chunkRepo, full.names = TRUE), recursive = TRUE)
filesToAnalyseData <- readRDS(file = getSourcePath("jobsOutput/jsonFilesToAnalyse.rds"))
filesToAnalyse <- filesToAnalyseData$jsonFile
# Round-robin split of the JSON files into 20 chunks.
json_files_split <- split(filesToAnalyse, 1:20)

# Vectorise grepl over its pattern argument so that all measured field
# patterns can be tested against the JSON names in a single call.
v_grepl <- Vectorize(grepl, vectorize.args = "pattern")

no_cores <- detectCores() - 1
cl <- makeCluster(no_cores, type = "FORK")
# Assign to a throwaway variable to suppress the printed result of parLapply.
deafen <- parLapply(cl, json_files_split, function(json_chunk){
  json_DT <- lapply(json_chunk, function(x){
    jsonList <- fromJSON(x, flatten = TRUE)
    jsonVec <- unlist(jsonList, recursive = TRUE)
    existFields <- v_grepl(fieldsMeasure, names(jsonVec)) 
    existFields <- apply(existFields, 1, function(x)sum(x) != 0)
    resultVec <- jsonVec[existFields]
    if(length(resultVec) == 0)return(NULL)
    result <- resultVec %>% data.table %>% transpose
    if(nrow(result) == 0)return(NULL)
    setnames(result, names(resultVec))
    result[ , id := x]
  }) %>% rbindlist(fill = TRUE)
  # Timestamp-based file name to keep the chunk files distinct.
  uniqueNum <- as.numeric(Sys.time())
  saveRDS(object = json_DT, file = paste0(chunkRepo, "computer", uniqueNum, ".rds"))
}) 
stopCluster(cl)
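Since v_grepl is vectorised over pattern, calling it with the vector of measured field patterns returns a logical matrix with one row per JSON field name and one column per pattern; the apply(…, 1, …) step above therefore keeps the fields whose name matches at least one pattern. A minimal sketch with toy values:

m <- v_grepl(c("foo", "bar"), c("foobar", "baz"))  # 2 x 2 logical matrix
apply(m, 1, function(x) sum(x) != 0)               # TRUE for "foobar", FALSE for "baz"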

#binding chunks
dataChunks <- list.files(path = chunkRepo, full.names = TRUE)
mergeChunksDT <- lapply(dataChunks, function(x){
  readRDS(x)
}) %>% rbindlist(fill = TRUE, use.names = TRUE)
  • Keep variables with more than 0.1% completion.
missingPerCol <- sapply(mergeChunksDT, function(x){
  sumNa <- sum(is.na(x))
  sumNa/length(x)
})
# Keep columns that are less than 99.9% missing, i.e. more than 0.1% complete.
mergeChunksDT <- mergeChunksDT[, missingPerCol < 0.999, with = FALSE]
setnames(mergeChunksDT, "SkillsPassport.Locale", "locale")

## Computer Skills Data

computerDT <- mergeChunksDT[, !grepl("ForeignLanguage", names(mergeChunksDT)), with = FALSE]
  • Cast the table into long format.
meltComputer <- melt(computerDT, id.vars = c("id", "locale"), na.rm = TRUE)
meltComputer[, variable := gsub("SkillsPassport.LearnerInfo.Skills.Computer.", "", variable)]
meltComputer[, id := cleanJsonId(id)]

## Foreign Language Data

foreignDT <- mergeChunksDT[, !grepl("Computer", names(mergeChunksDT)), with = FALSE]
  • Cast the table into long format.
meltForeign <- melt(foreignDT, id.vars = c("id", "locale"), na.rm = TRUE)
meltForeign[, variable := gsub("SkillsPassport.LearnerInfo.Skills.Linguistic.ForeignLanguage.", "", variable)]
meltForeign[, id := cleanJsonId(id)]
meltForeign[, num := gsub("[^\\d]+", "", variable, perl = TRUE)]  # keep the trailing per-language index digit
meltForeign[, variable := gsub("[[:digit:]]", "", variable)]      # drop the index from the variable name
foreignData <- dcast(meltForeign, id + locale + num ~ variable)[, num := NULL]
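To illustrate the reshape, consider a toy melted table (a minimal sketch with hypothetical values): the trailing digit becomes the per-language index num, and casting then yields one row per foreign language entry.

toy <- data.table(
  id = "cv1", locale = "en",
  variable = c("Description.Code1", "Description.Code2"),
  value = c("en", "fr")
)
toy[, num := gsub("[^\\d]+", "", variable, perl = TRUE)]  # "1", "2"
toy[, variable := gsub("[[:digit:]]", "", variable)]      # "Description.Code"
dcast(toy, id + locale + num ~ variable)                  # one row per language entry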

Save data

  • Save computer related data
data <- meltComputer[!duplicated(meltComputer)]
saveRDS(data, getSourcePath("jobsOutput/skillsComputerDT.rds"))
summariseTable(data)
  • Save foreign language data
data <- foreignData[!duplicated(foreignData)]
saveRDS(data, getSourcePath("jobsOutput/skillsForeignLanguageDT.rds"))
summariseTable(data)
timeNeeded <- (proc.time()[3] -  source.starting.time);

01.computer.R completed in 2883.41 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)

Computation metrics

source.blocks$sourceTimeNeeded <- sourceTimeNeeded;

Computational report

Completed in 2884.75 seconds.

Subpart metrics

reportTabularData(source.blocks);
## NULL

End of report

Reports index

An Eworx S.A. DSENSE report for Europass.

– end of report –