sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]

Environment

R version

R.Version()$version.string

## [1] "R version 3.4.4 (2018-03-15)"

Libraries intialisation

librariesVersion <- c()

for(i in 1:length(libraries))
    librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))

librariesLoaded <- lapply(libraries, require, character.only = TRUE)

## Loading required package: magrittr

## Loading required package: DT

## Loading required package: ggplot2

## Loading required package: jsonlite

## Loading required package: methods

## Loading required package: parallel

## Loading required package: lubridate

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following object is masked from 'package:base':
## 
##     date

timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.01.libraries.R completed in 1.21 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]


## Base functions

# @authors kp@eworx.gr ako@eworx.gr
repository <- "/data/generic/"

getSourcePath <- function(filename, baseFolder = repository){
    return(paste(baseFolder, filename, sep = ""))
}

readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
    if(length(colClasses) == 0)
        return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
    return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE,   na.strings = na.strings )  )
}

#rds for small disk space & fst for fast load
saveBinary <- function(data, filename = filename, baseFolder = repository, format = "rds"){
  if(format == "rds"){saveRDS(data, getSourcePath(filename, baseFolder))}
  if(format == "fst"){fst::write_fst(data, getSourcePath(filename, baseFolder))}
}

#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
  if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
  if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}

rowColumns <- function(data){
    return(paste( format(nrow(data),  big.mark=","), "Rows X ", ncol(data), "Columns"))
}

publishIncludeCss <- function(){
    sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
    destinatinoFile <- "/data/tmpfs/results/include.css"
    if (!file.exists(destinatinoFile)) {
        return (file.copy(sourceFile, destinatinoFile))
    }else{
        return(TRUE);
    }

}
#as the mountstorage is on memory make sure the asset include.css is there.

summariseTable <- function(data){
    return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
    #return(do.call(cbind, lapply(data, summary)))
}

factoriseCharacterColumns <- function(data){
    for(name in names(data)){
        if( class(data[[name]]) =="character"){
            data[[name]] <- as.factor(data[[name]])
        }
    }
    return(data) 
}

############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown

capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))


styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){

    if(columnName%in% columnsName_original){
        result <- result %>%   formatStyle(
            columnName,
            background = styleColorBar(data[[columnName]], color),
            backgroundSize = '100% 90%',
            backgroundRepeat = 'no-repeat',
            backgroundPosition = 'center'
        )
    }
    return(result)

}


reportTabularData <- function(data, anonymize=TRUE){

    if(anonymize)return()

    columnsName <- names(data)
    #columnsName <- lapply(columnsName, capitalise)
    columnsName_original <- names(data)

    result <-
        DT::datatable(
            data,
            class = 'cell-border stripe',
            filter = 'top',
            rownames = FALSE,
            colnames = columnsName,
            extensions = 'Buttons',
            options = list(
                pageLength = 20,
                columnDefs = list(list(className = 'dt-left', targets = "_all")),
                dom = 'Bfrtip',
                buttons = c('copy', 'csv', 'excel', 'pdf'),
                searchHighlight = TRUE,
                initComplete = JS(
                    "function(settings, json) {",
                        "$(this.api().table().header()).css({'border': '1px solid'});",
                    "}"
                )
            )

        )


    result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
    result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
    result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
    result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)

    return(result)
}

fonts <- list(
 sans = "DejaVu Serif",
  mono = "DejaVu Serif",
  `Times New Roman` = "DejaVu Serif"
)

cleanJsonId <- function(txt){
    txt <- gsub("\\.json", "", txt)
    gsub(".*/", "", txt)
}

embed_data <- function(x= mtcars, filename= "file.csv", label= "Get data"){

  # Create encoded Base64 datastream 
  encode_data= function(x){
    saveMe <- getSourcePath("file.csv")
    write.csv2(x, saveMe)
    enc= sprintf('data:text/csv;base64,%s', openssl::base64_encode(paste0(readLines(saveMe), collapse="\n")) )
    unlink(saveMe)
    return(enc)
  }

  # String result ready to be placed in rmarkdown
  paste0("<a download='", filename, "' href=", encode_data(x), ">", label, "</a>")

}

###########################################################################################################

Libraries version

if(exists("libraries")){
    data.table(library = libraries, version = librariesVersion)
}

includeCssPublished <- publishIncludeCss()

Session info

sessionInfo()

## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  methods   stats     graphics  grDevices utils     datasets 
## [8] base     
## 
## other attached packages:
## [1] lubridate_1.7.4   jsonlite_1.6      ggplot2_3.2.1     DT_0.10          
## [5] magrittr_1.5      rmarkdown_2.1     data.table_1.12.6
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3        compiler_3.4.4    pillar_1.4.3      highr_0.8        
##  [5] tools_3.4.4       digest_0.6.22     evaluate_0.14     lifecycle_0.2.0  
##  [9] tibble_3.0.1      gtable_0.3.0      pkgconfig_2.0.3   rlang_0.4.5      
## [13] yaml_2.2.0        xfun_0.11         withr_2.1.2       stringr_1.4.0    
## [17] dplyr_0.8.3       knitr_1.26        htmlwidgets_1.5.1 vctrs_0.2.4      
## [21] grid_3.4.4        tidyselect_0.2.5  glue_1.3.1        R6_2.4.1         
## [25] purrr_0.3.3       scales_1.1.0      htmltools_0.4.0   ellipsis_0.3.0   
## [29] assertthat_0.2.1  colorspace_1.4-1  stringi_1.4.3     lazyeval_0.2.2   
## [33] munsell_0.5.0     crayon_1.3.4

timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.02.base.functions.R completed in 0.1 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Work Experience data set

After the phase of exploratory data analysis the following fields are identified as meaningful variables concerning the Work Experience dataset.

fieldsMeasure <- c("SkillsPassport.Locale", 
                   "WorkExperience.Period.From.Year",
                   "WorkExperience.Period.To.Year",
                   "WorkExperience.Position",
                   "Employer.Name"
)
fieldsMeasure

## [1] "SkillsPassport.Locale"           "WorkExperience.Period.From.Year"
## [3] "WorkExperience.Period.To.Year"   "WorkExperience.Position"        
## [5] "Employer.Name"

Read unique CVs

The JSON data are parsed in batches and the relevant fields are measured. For each batch a tabular dataset is calculated and saved in binary form.

chunkRepo <- getSourcePath("jobsOutput/workEXPChunks/")
unlink(list.files(chunkRepo, full.names = TRUE))%>%on.exit

filesToAnalyseData <- readRDS(file = getSourcePath("jobsOutput/jsonFilesToAnalyse.rds"))
filesToAnalyse <- filesToAnalyseData$jsonFile
json_files_split <- split(filesToAnalyse, 1:20)

v_grepl <- Vectorize(grepl, vectorize.args = "pattern")

no_cores <- detectCores() - 1 
cl <- makeCluster(no_cores, type = "FORK")
deafen <- parLapply(cl, json_files_split, function(json_chunk){
  json_DT <- lapply(json_chunk, function(x){
    jsonList <- fromJSON(x, flatten = TRUE)
    jsonVec <- unlist(jsonList, recursive = TRUE)
    existFields <- v_grepl(fieldsMeasure, names(jsonVec)) 
    existFields <- apply(existFields, 1, function(x)sum(x) != 0)
    resultVec <- jsonVec[existFields]
    result <- resultVec %>% data.table %>% transpose
    setnames(result, names(resultVec))
  }) %>% rbindlist(fill = TRUE)
  json_DT[, id := json_chunk]
  uniqueNum <- as.numeric(Sys.time())
  saveRDS(object = json_DT, file = paste0(chunkRepo, "workEXP", uniqueNum, ".rds"))
}) 
stopCluster(cl)
#binding chunks
dataChunks <- list.files(path = chunkRepo, full.names = TRUE)
mergeChunksDT <- lapply(dataChunks, function(x){
  readRDS(x)
}) %>% rbindlist(fill = TRUE, use.names = TRUE)

Keep variables with more than 2% completion.

missingPerCol <- sapply(mergeChunksDT, function(x){
  sumNa <- sum(is.na(x))
  sumNa/length(x)
})
mergeChunksDT <- mergeChunksDT[, missingPerCol < 0.98, with = FALSE]
varNames <- names(mergeChunksDT)

Cast table in long range format.

regexToMelt <- c("From.Year", "To.Year", "Position.Label", "Position.Code", "Employer.Name")
colsBoolMatrix <- v_grepl(regexToMelt, varNames)
colsMelt <- apply(colsBoolMatrix, 1, function(x)sum(x) != 0)
dataToMelt <- mergeChunksDT[, colsMelt, with = FALSE]
moltenLevelsCols <- gsub("[^0-9]", "", varNames)[colsMelt]
moltenLevels <- moltenLevelsCols %>% unique
dataStack <- dataToMelt[, moltenLevelsCols %in% moltenLevels[1], with = FALSE] 
dataStack <- cbind(dataStack, mergeChunksDT[, !colsMelt, with = FALSE])
setnames(dataStack, gsub("[0-9]", "", names(dataStack)))
deaf <- lapply(moltenLevels[-1], function(x){
  dat <- dataToMelt[, moltenLevelsCols %in% x, with = FALSE]
  setnames(dat, gsub("[0-9]", "", names(dat)))
  dat <- cbind(dat, mergeChunksDT[, !colsMelt, with = FALSE])
  dataStack <<- rbind(dataStack, dat, fill = TRUE, use.names = TRUE)
})
setnames(dataStack,  gsub("^[^\\.]*\\.", "", names(dataStack)))
setnames(dataStack,  gsub("^[^\\.]*\\.[^\\.]*\\.[^\\.]*\\.", "", names(dataStack)))
dataStack %>% names

## [1] "From.Year" "To.Year"   "Label"     "Name"      "Code"      "Locale"   
## [7] "id"

setnames(
  dataStack, 
  c("From.Year", "To.Year", "Label", "Code", "Name", "Locale", "id"), 
  c("from", "to", "label", "code", "employer", "locale", "id")
)
dataStack[, id := cleanJsonId(id)]

Save data

saveRDS(dataStack, getSourcePath("jobsOutput/workEXPDT.rds"))

Summary statistics

summariseTable(dataStack)

Sample of the resulting data

reportTabularData(dataStack %>% head(1000))

## NULL

timeNeeded <- (proc.time()[3] -  source.starting.time);

3.getWorkExperience.R completed in 2330.4 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)

Computation metrics

source.blocks$sourceTimeNeeded <- sourceTimeNeeded;

Computational report

Completed in 2331.71 seconds.

Subpart metrics

reportTabularData(source.blocks);

## NULL

Cleansing JSON CV (Work Experience).

EPAS

Report generated on

2020-04-26 14:53:03