Abstract
Combining data from stratified CVs and performing aggregations and statistical computations to construct an anonymized tidy dataset.
# NOTE: the package list below is an assumption, reconstructed from the loading output that follows.
libraries <- c("data.table", "magrittr", "DT", "ggplot2", "jsonlite", "parallel", "lubridate")
librariesVersion <- c()
for(i in seq_along(libraries))
    librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i])))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: magrittr
## Loading required package: DT
## Loading required package: ggplot2
## Loading required package: jsonlite
## Loading required package: parallel
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
../000.core/00.01.libraries.R completed in 0.83 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# @authors kp@eworx.gr ako@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t",
                     encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
    if(length(colClasses) == 0)
        return(data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep,
            encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose,
            showProgress = TRUE, na.strings = na.strings))
    return(data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses,
        header = header, sep = sep, encoding = encoding, verbose = verbose,
        showProgress = TRUE, na.strings = na.strings))
}
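For illustration, a hypothetical call (the path and the column classes below are assumptions, not artefacts of this job):
demo <- readData("jobsOutput/tidySurvey/demographStat.tsv", # hypothetical file
    colClasses = c(id = "character", age = "integer"))      # hypothetical column classes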
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
fileName <- getSourcePath(filename, baseFolder)
dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
if(format == "rds") saveRDS(data, fileName)
if(format == "fst") fst::write_fst(data, fileName)
}
# Alternative to saveRDS for ad-hoc read/write operations; creates the target directory if missing.
saveRDS_ <- function(object, file){
dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
saveRDS(object, file)
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
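A minimal round trip through the two helpers, assuming a hypothetical tmp/example.fst path under the repository:
exampleDT <- data.table::data.table(id = 1:3, value = c("a", "b", "c"))
saveBinary(exampleDT, "tmp/example.fst", format = "fst") # fst for fast load
all.equal(exampleDT, loadBinary("tmp/example.fst", format = "fst")) # expected TRUE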
rowColumns <- function(data){
return(paste(format(nrow(data), big.mark = ","), "Rows X", ncol(data), "Columns"))
}
# As the mounted storage is in memory, make sure the asset include.css is there.
publishIncludeCss <- function(){
    sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
    destinationFile <- "/data/tmpfs/results/include.css"
    if (!file.exists(destinationFile)) {
        return(file.copy(sourceFile, destinationFile))
    }else{
        return(TRUE)
    }
}
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
    for(name in names(data)){
        if(is.character(data[[name]])){
            data[[name]] <- as.factor(data[[name]])
        }
    }
    return(data)
}
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName %in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data){
columnsName <- names(data)
#columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
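A hypothetical usage with a small frequency table, so the "Count" column receives the steel-blue bar styling:
exampleCounts <- data.table::data.table(employer = c("Zara", "Lidl", "H&M"), Count = c(120L, 85L, 60L))
reportTabularData(exampleCounts)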
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
cleanJsonId <- function(txt){
txt <- gsub("\\.json", "", txt)
gsub(".*/", "", txt)
}
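For example, applied to a hypothetical path:
cleanJsonId("/data/generic/jobsOutput/12345.json") # returns "12345"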
embed_data <- function(x = mtcars, filename = "file.csv", label = "Get data"){
    # Create an encoded Base64 data stream
    encode_data <- function(x){
        saveMe <- getSourcePath("file.csv")
        write.csv2(x, saveMe)
        enc <- sprintf('data:text/csv;base64,%s', openssl::base64_encode(paste0(readLines(saveMe), collapse = "\n")))
        unlink(saveMe)
        return(enc)
    }
# String result ready to be placed in rmarkdown
paste0("<a download='", filename, "' href=", encode_data(x), ">", label, "</a>")
}
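The returned anchor string is meant to be embedded inline in the R Markdown source, for example (hypothetical call):
embed_data(head(mtcars), filename = "mtcars-sample.csv", label = "Download sample")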
###########################################################################################################
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] lubridate_1.7.10 jsonlite_1.7.2 ggplot2_3.3.3 DT_0.18
## [5] magrittr_2.0.1 rmarkdown_2.8 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 bslib_0.2.5 compiler_4.0.5 pillar_1.6.0
## [5] jquerylib_0.1.4 highr_0.9 tools_4.0.5 digest_0.6.27
## [9] evaluate_0.14 lifecycle_1.0.0 tibble_3.1.1 gtable_0.3.0
## [13] pkgconfig_2.0.3 rlang_0.4.11 yaml_2.2.1 xfun_0.23
## [17] withr_2.4.2 stringr_1.4.0 dplyr_1.0.6 knitr_1.33
## [21] generics_0.1.0 htmlwidgets_1.5.3 sass_0.4.0 vctrs_0.3.8
## [25] grid_4.0.5 tidyselect_1.1.1 glue_1.4.2 R6_2.5.0
## [29] fansi_0.4.2 purrr_0.3.4 scales_1.1.1 htmltools_0.5.1.1
## [33] ellipsis_0.3.2 colorspace_2.0-1 utf8_1.2.1 stringi_1.6.2
## [37] munsell_0.5.0 crayon_1.4.1
../000.core/00.02.base.functions.R completed in 0.07 seconds
Dimensions: 353518
fileName <- "jobsOutput/tidySurvey/demographStat.fst"
demographStat <- loadBinary(fileName, format = "fst")
demographStat <- demographStat[id %in% strataData]
Dimensions: 353518, 20
fileName <- "jobsOutput/workExperience/occupationsForMatching/workExpEscoDT.fst"
workPredict <- loadBinary(fileName, format = "fst") %>% unique
workPredict <- workPredict[id %in% strataData]
Dimensions: 1699275, 12
inputRepo <- getSourcePath("jobsOutput/")
filename <- "escoIscoMapping.rds"
escoIscoMapping <- loadBinary(filename, inputRepo)
1.load.data.R completed in 40.96 seconds
To identify and de-duplicate the employers variable in the work experience data set:
First, the employers free-text is aggregated and the top 100 keywords/phrases are treated as the top employers.
Then, a manual annotation process is followed to identify the most common parts of keywords/phrases referring to the same employer, by looking at the most popular employers and using “grep”; for example, “donald” for “McDonald’s” and “Mc Donald’s”. Among the keywords/phrases identified as the same, the one that is kept is either the English-language one or the most popular one.
To identify the freelancers, the private-company employers and the family-business employers, a similar process is followed: the most common parts of the free-text are “grepped” and those employers are treated as the same.
Future work: a better approach would be a matching process similar to the one used to match the 3 pillars to ESCO. Find the top n employers to match the free-text against. Then cleanse both the top employers and the free-text, and create a new column with the cleansed free-text. After that, use a string-distance method, for example the Levenshtein distance, to match the cleansed free-text against the cleansed, most popular employers, as in the sketch below.
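A minimal sketch of that string-distance matching, assuming the stringdist package and hypothetical, already-cleansed topEmployers and freeText vectors:
library(stringdist)
topEmployers <- c("mcdonalds", "carrefour", "vodafone") # cleansed top-n employers (hypothetical)
freeText <- c("mc donalds", "carefour", "vodafon") # cleansed free-text entries (hypothetical)
# Levenshtein distance between every free-text entry and every top employer
distances <- stringdistmatrix(freeText, topEmployers, method = "lv")
nearest <- apply(distances, 1, which.min)
# accept a match only when the nearest employer is within a distance threshold
matchedEmployer <- ifelse(apply(distances, 1, min) <= 3, topEmployers[nearest], NA)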
# TO-DO: Move code in a proper location prior to this step
workDemo[, employer := factor(employer, levels = unique(employer))]
employers <- workDemo[!is.na(employer), .(count = .N), by = "employer"][order(-count)][count > 3]
levels <- levels(workDemo$employer)
changes <- c("donald" = "McDonald's", "poste italiane" = "Poste Italiane", "pingo" = "Pingo Doce",
"continente" = "Continente", "zara" = "Zara", "vodafone" = "Vodafone", "carrefour" = "Carrefour",
"teleperfo" = "Teleperformance", "amazon" = "Amazon", "accenture" = "Accenture", "randstad" = "Randstad",
"primark" = "Primark", "telekom" = "Deutsche Telekom", "decathlon" = "Decathlon Group", "h&m" = "H&M",
"hennes" = "H&M", "merlin" = "Leroy Merlin", "lidl" = "Lidl", "er king" = "Burger King", "sonae" = "Sonae",
"telepizza" = "Telepizza", "adecco" = "Adecco", "ryanair" = "Ryanair", "conad" = "Conad", "dhl" = "DHL",
"manpower" = "Manpower Group", "aiesec" = "AIESEC", "worten" = "Worten", "auchan" = "Auchan",
"pizza hut" = "Pizza Hut", "intermarch" = "Intermarché")
# Merge every factor level matching each keyword (case-insensitive) into its canonical employer name.
for(i in seq_along(changes)){
    name <- names(changes)[i]
    value <- changes[[i]]
    same <- levels[grepl(name, levels, ignore.case = TRUE)]
    levels[levels %in% same] <- value
}
freelancers <- c("Libero professionista", "Libero Professionista", "Libera professione", "libero professionista",
"Libera professionista", "Libera Professione", "Freelance", "freelance", "Freelancing")
levels[levels %in% freelancers] <- "Freelancer"
self_employed <- c("Autonomo", "Autónomo", "Autônomo", "Lavoro autonomo", "Lavoratore autonomo",
"Self employed", "Self Employed", "Self-Employed", "Self-employed")
levels[levels %in% self_employed] <- "Freelancer"
private <- c("Privato", "Privati", "privato", "Private", "privati", "PRIVATO", "Privat", "Privado",
"Azienda privata")
levels[levels %in% private] <- "Private Company"
family_business <- c("Azienda di famiglia", "Azienda familiare", "Famiglia", "famiglia privata",
"Famiglia privata", "Famiglia Privata", "Famiglie", "Famiglie private", "Famila")
levels[levels %in% family_business] <- "Family Business"
levels(workDemo$employer) <- levels
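For instance, a quick sanity check against workDemo confirms that the merged variants now collapse into single levels:
workDemo[!is.na(employer), .(count = .N), by = employer][order(-count)][1:10]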
2.process.data.R completed in 44.26 seconds
Datasource : /data/generic/jobsOutput/tidySurvey/workStat.fst of 403,372,557 bytes.
fileName <- "jobsOutput/tidySurvey/aggregateWorkStat.fst"
saveBinary(aggregateWork, fileName, format = "fst")
Datasource : /data/generic/jobsOutput/tidySurvey/aggregateWorkStat.fst of 456,792 bytes.
3.save.data.R completed in 2.27 seconds
Completed in 88.38 seconds.
An Eworx S.A. DSENSE report: a documented process for internal consumption.
– end of report –