sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]

Environment

R version

R.Version()$version.string

## [1] "R version 4.0.5 (2021-03-31)"

Libraries initialisation

# Record the installed version of every package named in `libraries` as a
# plain, unnamed character vector (one entry per package, in the same order).
# vapply() replaces the grow-in-a-loop pattern and, unlike 1:length(x),
# simply yields an empty result when `libraries` is empty.
librariesVersion <- vapply(
    libraries,
    function(pkg) paste(packageVersion(pkg)),
    character(1),
    USE.NAMES = FALSE
)

librariesLoaded <- lapply(libraries, require, character.only = TRUE)

## Loading required package: magrittr

## Loading required package: DT

## Loading required package: ggplot2

## Loading required package: jsonlite

## Loading required package: parallel

## Loading required package: lubridate

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.01.libraries.R completed in 0.93 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]


## Base functions

# @authors kp@eworx.gr ako@eworx.gr
repository <- "/data/generic/"

# Build the full path of a file by prefixing its name with a base folder.
#
# filename   : file name (or relative path) to append.
# baseFolder : prefix to prepend; defaults to the global `repository`.
#              No separator is inserted between the two parts.
# Returns the concatenated path(s) as a character vector.
getSourcePath <- function(filename, baseFolder = repository){
    fullPath <- paste0(baseFolder, filename)
    fullPath
}

# Read a delimited text file located under a base folder with data.table::fread.
#
# filename         : file name relative to `baseFolder`.
# colClasses       : optional column classes; when empty, fread guesses them.
# baseFolder       : folder prefix, defaults to the global `repository`.
# header, sep, encoding, na.strings, verbose : forwarded to fread as given.
# stringsAsFactors : forwarded to fread only when `colClasses` is empty;
#                    it is not forwarded in the `colClasses` branch.
# Returns whatever fread returns for the resolved path.
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
    inputPath <- getSourcePath(filename, baseFolder)
    if(length(colClasses) > 0){
        # Explicit column classes were supplied by the caller.
        loaded <- data.table::fread(
            input = inputPath,
            colClasses = colClasses,
            header = header,
            sep = sep,
            encoding = encoding,
            verbose = verbose,
            showProgress = TRUE,
            na.strings = na.strings
        )
    }else{
        # No column classes: let fread guess and apply stringsAsFactors.
        loaded <- data.table::fread(
            input = inputPath,
            header = header,
            sep = sep,
            encoding = encoding,
            stringsAsFactors = stringsAsFactors,
            verbose = verbose,
            showProgress = TRUE,
            na.strings = na.strings
        )
    }
    return(loaded)
}

#rds for small disk space & fst for fast load
# Save an object below a base folder in a binary format.
#
# data       : object to store.
# filename   : target file name relative to `baseFolder` (required; the former
#              self-referential default `filename = filename` could never be
#              evaluated and only produced a confusing error).
# baseFolder : folder prefix, defaults to the global `repository`.
# format     : "rds" (saveRDS) or "fst" (fst::write_fst).
# Missing parent directories are created. An unsupported `format` now raises
# an error instead of silently writing nothing.
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
  if(!is.character(format) || length(format) != 1L || !(format %in% c("rds", "fst")))
    stop("Unsupported format '", paste(format, collapse = ", "), "'; expected \"rds\" or \"fst\"", call. = FALSE)
  fileName <- getSourcePath(filename, baseFolder)
  dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
  switch(format,
    rds = saveRDS(data, fileName),
    fst = fst::write_fst(data, fileName)
  )
}

#alternative for rough read write operations
# saveRDS variant that first makes sure the target directory exists.
#
# object : R object to serialise.
# file   : destination path; missing parent directories are created.
saveRDS_ <- function(object, file){
  targetDir <- dirname(file)
  dir.create(targetDir, showWarnings = FALSE, recursive = TRUE)
  saveRDS(object, file = file)
}

#rds for small disk space & fst for fast load
# Load an object previously stored below a base folder.
#
# filename      : file name relative to `baseFolder`.
# baseFolder    : folder prefix, defaults to the global `repository`.
# format        : "rds" (readRDS) or "fst" (fst::read_fst).
# as.data.table : forwarded to fst::read_fst; ignored for "rds".
# An unsupported `format` now raises an error instead of silently
# returning NULL.
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
  if(!is.character(format) || length(format) != 1L || !(format %in% c("rds", "fst")))
    stop("Unsupported format '", paste(format, collapse = ", "), "'; expected \"rds\" or \"fst\"", call. = FALSE)
  sourcePath <- getSourcePath(filename, baseFolder)
  if(format == "rds"){return(readRDS(sourcePath))}
  fst::read_fst(sourcePath, as.data.table = as.data.table)
}

# Describe the dimensions of a table as text.
#
# data : any object for which nrow() and ncol() are defined.
# Returns one string with the row count (thousands separated by ",")
# followed by the column count.
rowColumns <- function(data){
    rowCount <- format(nrow(data), big.mark = ",")
    columnCount <- ncol(data)
    description <- paste(rowCount, "Rows X ", columnCount, "Columns")
    description
}

# Make sure include.css is available in the results folder.
#
# Returns TRUE when the destination file already exists; otherwise returns
# the result of file.copy() from the fixed source path to the destination.
publishIncludeCss <- function(){
    cssOrigin <- "/data/jobs/wp41.analysis/000.core/include.css"
    cssTarget <- "/data/tmpfs/results/include.css"
    if (file.exists(cssTarget)) {
        return(TRUE)
    }
    file.copy(cssOrigin, cssTarget)
}
# As the mounted storage is in memory, make sure the include.css asset is there.

# Turn the output of summary() into a plain data.frame.
#
# data : object accepted by summary().
# Returns a data.frame built from the unclassed summary, with names left
# unchecked and strings kept as character.
summariseTable <- function(data){
    # Alternative kept from the original: do.call(cbind, lapply(data, summary))
    data.frame(
        unclass(summary(data)),
        check.names = FALSE,
        stringsAsFactors = FALSE
    )
}

# Convert every character column of a table into a factor.
#
# data : data.frame-like object whose columns are reachable with [[name]].
# Returns `data` with character columns replaced by factors; other columns
# are left untouched.
factoriseCharacterColumns <- function(data){
    for(name in names(data)){
        # inherits() stays a single TRUE/FALSE even when a column carries
        # several classes, where `class(x) == "character"` would hand `if`
        # a vector condition.
        if(inherits(data[[name]], "character")){
            data[[name]] <- as.factor(data[[name]])
        }
    }
    return(data)
}

############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown

# Upper-case the first character of each string and leave the rest unchanged.
capitalise <- function(x) {
    firstLetter <- toupper(substring(x, 1, 1))
    remainder <- substring(x, 2, nchar(x))
    paste0(firstLetter, remainder)
}


# Add a colour-bar background to one column of a DT widget, if present.
#
# data                 : table the widget was built from (supplies the values
#                        passed to styleColorBar).
# result               : widget to decorate.
# columnName           : column to style.
# color                : bar colour.
# columnsName_original : names available in the table; when `columnName` is
#                        not among them, `result` is returned unchanged.
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
    if(!(columnName %in% columnsName_original)){
        return(result)
    }
    barBackground <- styleColorBar(data[[columnName]], color)
    formatStyle(
        result,
        columnName,
        background = barBackground,
        backgroundSize = '100% 90%',
        backgroundRepeat = 'no-repeat',
        backgroundPosition = 'center'
    )
}


# Render a table as an interactive DT widget with export buttons, column
# filters and colour bars on a fixed set of numeric columns.
#
# data : table to display; its names are used as the column headers.
# Returns the DT widget.
reportTabularData <- function(data){

    # Headers are shown as-is (a capitalise() pass was disabled in the original).
    originalNames <- names(data)

    headerCallback <- JS(
        "function(settings, json) {",
            "$(this.api().table().header()).css({'border': '1px solid'});",
        "}"
    )

    tableOptions <- list(
        pageLength = 20,
        columnDefs = list(list(className = 'dt-left', targets = "_all")),
        dom = 'Bfrtip',
        buttons = c('copy', 'csv', 'excel', 'pdf'),
        searchHighlight = TRUE,
        initComplete = headerCallback
    )

    widget <- DT::datatable(
        data,
        class = 'cell-border stripe',
        filter = 'top',
        rownames = FALSE,
        colnames = originalNames,
        extensions = 'Buttons',
        options = tableOptions
    )

    # Columns that get a colour bar when present, applied in this order.
    barColours <- c(
        Count = 'steelblue',
        sourceTimeNeeded = '#808080',
        timeNeeded = '#808080',
        percentMatch = '#4682b4'
    )
    for(barColumn in names(barColours)){
        widget <- styliseDTNumericalColumn(data, widget, barColumn, barColours[[barColumn]], originalNames)
    }

    return(widget)
}

# Font family lookup: every listed family name maps to "DejaVu Serif".
# NOTE(review): the consumer of this list is not visible here — confirm usage.
fonts <- list(
 sans = "DejaVu Serif",
  mono = "DejaVu Serif",
  `Times New Roman` = "DejaVu Serif"
)

# Reduce a JSON file path to a bare identifier: every ".json" occurrence is
# removed, then everything up to and including the last "/" is dropped.
cleanJsonId <- function(txt){
    withoutExtension <- gsub("\\.json", "", txt)
    bareId <- gsub(".*/", "", withoutExtension)
    bareId
}

# Build an HTML download link that embeds a table as a Base64 CSV data URI.
#
# x        : table to embed (written with write.csv2).
# filename : name suggested to the browser via the `download` attribute.
# label    : visible link text.
# Returns one HTML string ready to be placed in an R Markdown document.
embed_data <- function(x= mtcars, filename= "file.csv", label= "Get data"){

  # Serialise `x` to CSV and return it as a Base64 data URI.
  encode_data <- function(x){
    # Use a private scratch file instead of "<repository>/file.csv": the old
    # location overwrote and then deleted any real file of that name, and
    # required the repository to be writable.
    scratchFile <- tempfile(fileext = ".csv")
    # Clean up even when writing or encoding fails.
    on.exit(unlink(scratchFile), add = TRUE)
    write.csv2(x, scratchFile)
    csvText <- paste0(readLines(scratchFile), collapse="\n")
    sprintf('data:text/csv;base64,%s', openssl::base64_encode(csvText))
  }

  # String result ready to be placed in rmarkdown
  paste0("<a download='", filename, "' href=", encode_data(x), ">", label, "</a>")

}

###########################################################################################################

Libraries version

# When `libraries` exists, tabulate each package name next to its recorded
# version.
if(exists("libraries")){
    data.table(library = libraries, version = librariesVersion)
}

# Publish include.css next to the results and keep the returned status.
includeCssPublished <- publishIncludeCss()

Session info

sessionInfo()

## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] lubridate_1.7.10  jsonlite_1.7.2    ggplot2_3.3.3     DT_0.18          
## [5] magrittr_2.0.1    rmarkdown_2.8     data.table_1.14.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6        bslib_0.2.5       compiler_4.0.5    pillar_1.6.0     
##  [5] jquerylib_0.1.4   highr_0.9         tools_4.0.5       digest_0.6.27    
##  [9] evaluate_0.14     lifecycle_1.0.0   tibble_3.1.1      gtable_0.3.0     
## [13] pkgconfig_2.0.3   rlang_0.4.11      yaml_2.2.1        xfun_0.23        
## [17] withr_2.4.2       stringr_1.4.0     dplyr_1.0.6       knitr_1.33       
## [21] generics_0.1.0    htmlwidgets_1.5.3 sass_0.4.0        vctrs_0.3.8      
## [25] grid_4.0.5        tidyselect_1.1.1  glue_1.4.2        R6_2.5.0         
## [29] fansi_0.4.2       purrr_0.3.4       scales_1.1.1      htmltools_0.5.1.1
## [33] ellipsis_0.3.2    colorspace_2.0-1  utf8_1.2.1        stringi_1.6.2    
## [37] munsell_0.5.0     crayon_1.4.1

timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.02.base.functions.R completed in 0.07 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Load sample data

Loading sample info.

# Load the sample (strata) object from its RDS file below the repository.
fileName <- "jobsOutput/strataData.rds"
strataData <- loadBinary(fileName)

Dimensions: ****

Loading Demographic data.

# Load the demographic table and keep only rows whose id is in the sample.
# NOTE(review): `id %in% strataData` presumes strataData is a plain vector of
# ids rather than a table — confirm against the stored object.
fileName <- "jobsOutput/demographDT.rds"
demographDT <- loadBinary(fileName)
demographDT <- demographDT[id %in% strataData]

Dimensions: 353518, 9

Loading Work Experience data.

# Load the work experience table, drop duplicated rows, and keep only rows
# whose id is in the sample.
fileName <- "jobsOutput/workEXPDT.rds"
workEXPDT <- loadBinary(fileName)%>%unique
workEXPDT <- workEXPDT[id %in% strataData]

Dimensions: 1699275, 7

Loading Education data.

# Load the education predictions, drop duplicated rows, and keep only rows
# whose id is in the sample.
fileName <- "jobsOutput/education/predictions/cvPredictions.rds"
educationDT <- loadBinary(fileName)%>%unique
educationDT <- educationDT[id %in% strataData]

Dimensions: 332553, 6

Loading Headline data.

# Load the headline table (stored as fst), drop duplicated rows, and keep
# only rows whose id is in the sample.
fileName <- "jobsOutput/workExperience/occupationsForMatching/headlinesEscoDT.fst"
headlineDT <- loadBinary(fileName, format = "fst") %>% unique
headlineDT <- headlineDT[id %in% strataData]

Dimensions: 353518, 15

timeNeeded <- (proc.time()[3] -  source.starting.time);

1.load.data.R completed in 77.49 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Collecting demographic features and statistics.

Work related features

Removing unlabeled work experience observations and outliers.

# Keep labelled work experiences only and coerce the year columns to numeric.
workEXPDT <- workEXPDT[!is.na(label)]
workEXPDT[ , from := as.numeric(from)]
workEXPDT[ , to := as.numeric(to)]
# Treat out-of-range years as missing rather than dropping the row.
workEXPDT[ to > 2025, to := NA]
workEXPDT[ from < 1960, from := NA]
# Drop rows that end before they start; rows with a missing bound are kept.
# An explicit keep-mask replaces `-which(to < from)`, which selects zero rows
# (i.e. empties the table) whenever no row violates the condition.
workEXPDT <- workEXPDT[is.na(to) | is.na(from) | to >= from]

The work experience data set is processed to add new demographic features and statistics in the data model. Namely,

Working Years.

# Duration of each work experience in years.
workEXPDT[ , years := to - from]
# Open-ended experiences (start known, no end) are measured up to 2019.
# NOTE(review): 2019 is hard-coded — presumably the data snapshot year; confirm.
workEXPDT[is.na(to) & (!is.na(from)), years := 2019 - from]
# Experiences starting and ending in the same year count as one year.
workEXPDT[from == to, years := 1]
# Non-positive or infinite durations are treated as missing.
workEXPDT[years <= 0 | years == Inf, years := NA]

# Total working years per id (missing durations ignored); non-positive or
# infinite totals are set to missing.
workEXPDT[, work_years := sum(years, na.rm = TRUE), by = "id"]
workEXPDT[work_years <= 0 | work_years == Inf, work_years := NA]

Employment Status.

# Per-experience employment flag, NA unless a start year is known:
# TRUE  when the experience has a start but no end (still ongoing),
# FALSE when both start and end are present.
workEXPDT[, employed := NA]
workEXPDT[is.na(to) & (!is.na(from)), employed := TRUE]
workEXPDT[(!is.na(to)) & (!is.na(from)), employed := FALSE]

Number of work experiences.

# Number of work experience rows per id.
workEXPDT[, num_work := .N, by = "id"]

Mean years staying in a job.

# Mean duration per id, ignoring missing durations (NaN when all are missing).
workEXPDT[, mean_work_years := mean(years, na.rm = TRUE), by = "id"]

Max years staying in a job.

# Longest duration per id; max() yields -Inf (with a warning) for ids whose
# durations are all missing.
workEXPDT[, max_work_years := max(years, na.rm = TRUE), by = "id"]

Min years staying in a job.

# Shortest duration per id; min() yields Inf (with a warning) for ids whose
# durations are all missing.
workEXPDT[, min_work_years := min(years, na.rm = TRUE), by = "id"]
# Replace the Inf/-Inf left behind by min()/max() with NA in every numeric
# column. vapply() keeps the result type stable, and only the affected rows
# are overwritten in place instead of rebuilding whole columns with ifelse()
# (which also strips column attributes).
numCols <- which(vapply(workEXPDT, is.numeric, logical(1)))
for(j in numCols){
 infiniteRows <- which(is.infinite(workEXPDT[[j]]))
 if(length(infiniteRows) > 0){
  set(workEXPDT, i = infiniteRows, j = j, value = NA)
 }
}

Join with demographic data.

# Collect the per-id work statistics computed above.
workDemograph <- workEXPDT[, .(id, work_years, employed, num_work, min_work_years, max_work_years, mean_work_years)]
# An id counts as employed when at least one of its experiences is flagged TRUE.
workDemograph[, is_employed := sum(employed, na.rm = TRUE) > 0, by = "id"]
# Drop the per-experience flag, then remove the resulting duplicated rows.
workDemograph <- workDemograph[, employed := NULL] %>% unique
# Left join: every demographic row is kept, with NA where no work data exists.
demographStat <- merge(demographDT, workDemograph, by = "id", all.x = TRUE)

Education related features

Keep the qualifications that have a starting date (from), and from those keep the CVs that have fewer than 2 NA end dates (to).

# A missing student flag is treated as "not a student".
educationDT[is.na(student), student := FALSE]
# Students get their ongoing level as the current level and their completed
# level as the previous one; non-students get the completed level and no
# previous level.
educationDT[, level := ifelse(student == TRUE, ongoing_level, complete_level)]
educationDT[, previous := ifelse(student == TRUE, complete_level, NA)]
# Keep the derived columns under their output names and left-join them onto
# the demographic statistics.
educationEqfLevel <- educationDT[, .(id, level, previous, student)]
setnames(educationEqfLevel, c("id", "eqf_level", "eqf_previous", "is_student"))
demographStat <- merge(demographStat, educationEqfLevel, by = "id", all.x = TRUE)

Adding Headline

# Keep headline rows of the three listed types, rename the selected columns,
# and left-join them onto the demographic statistics.
# NOTE(review): an id with several matching headline rows would be duplicated
# by this merge — confirm headlines are unique per id.
headlineData <- headlineDT[type_code %in% c("job_applied_for", "preferred_job", "position"), .(id, type_code, iscoLabel)]
setnames(headlineData, c("id", "headline_type", "headline_isco"))
demographStat <- merge(demographStat, headlineData, by = "id", all.x = TRUE)

timeNeeded <- (proc.time()[3] -  source.starting.time);

2.process.data.R completed in 13.99 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Save data

# Store the combined table as fst below the repository, then print its first
# rows and a per-column summary.
fileName <- "jobsOutput/tidySurvey/demographStat.fst"
saveBinary(demographStat, fileName, format = "fst") 
head(demographStat)

summary(demographStat)

##       id               locale           creationDate                
##  Length:353518      Length:353518      Min.   :2005-10-22 00:00:00  
##  Class :character   Class :character   1st Qu.:2018-05-07 14:23:36  
##  Mode  :character   Mode  :character   Median :2019-07-16 11:01:21  
##                                        Mean   :2018-09-05 18:27:23  
##                                        3rd Qu.:2019-08-30 06:47:02  
##                                        Max.   :2019-09-30 04:50:31  
##                                        NA's   :74                   
##    lastUpdate                   postalcode          country         
##  Min.   :2019-07-15 12:09:20   Length:353518      Length:353518     
##  1st Qu.:2019-08-11 07:47:52   Class :character   Class :character  
##  Median :2019-09-02 13:42:03   Mode  :character   Mode  :character  
##  Mean   :2019-08-28 21:22:30                                        
##  3rd Qu.:2019-09-16 18:37:40                                        
##  Max.   :2033-01-11 00:16:39                                        
##                                                                     
##     gender           birthdate         nationality          work_years    
##  Length:353518      Length:353518      Length:353518      Min.   :  1.00  
##  Class :character   Class :character   Class :character   1st Qu.:  3.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  6.00  
##                                                           Mean   :  9.27  
##                                                           3rd Qu.: 12.00  
##                                                           Max.   :312.00  
##                                                           NA's   :51194   
##     num_work     min_work_years  max_work_years  mean_work_years
##  Min.   : 1.00   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00  
##  1st Qu.: 2.00   1st Qu.: 1.00   1st Qu.: 1.00   1st Qu.: 1.00  
##  Median : 4.00   Median : 1.00   Median : 2.00   Median : 1.50  
##  Mean   : 4.19   Mean   : 1.63   Mean   : 4.37   Mean   : 2.53  
##  3rd Qu.: 6.00   3rd Qu.: 1.00   3rd Qu.: 5.00   3rd Qu.: 2.75  
##  Max.   :13.00   Max.   :55.00   Max.   :57.00   Max.   :55.00  
##  NA's   :42107   NA's   :51194   NA's   :51194   NA's   :51194  
##  is_employed       eqf_level       eqf_previous    is_student     
##  Mode :logical   Min.   :1.00     Min.   :1.00     Mode :logical  
##  FALSE:157306    1st Qu.:4.00     1st Qu.:4.00     FALSE:247364   
##  TRUE :154105    Median :6.00     Median :6.00     TRUE :85189    
##  NA's :42107     Mean   :5.21     Mean   :5.17     NA's :20965    
##                  3rd Qu.:6.00     3rd Qu.:6.00                    
##                  Max.   :8.00     Max.   :8.00                    
##                  NA's   :107677   NA's   :298489                  
##  headline_type      headline_isco     
##  Length:353518      Length:353518     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
##

Datasource : /data/generic/jobsOutput/tidySurvey/demographStat.fst of 45,093,922 bytes.

timeNeeded <- (proc.time()[3] -  source.starting.time);

3.save.data.R completed in 1.69 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)

Computation metrics

source.blocks$sourceTimeNeeded <- sourceTimeNeeded;

Computational report

Completed in 94.17 seconds.

Subpart metrics

reportTabularData(source.blocks);

Tidying stratified demographic data.

EPAS

Report generated on

2021-07-05 11:25:44