Abstract
Combining data from stratified CVs and performing aggregations and statistical computations to construct an anonymized tidy dataset.
# NOTE: the package list below is an assumption, reconstructed from the loading output that follows.
libraries <- c("data.table", "magrittr", "DT", "ggplot2", "jsonlite", "parallel", "lubridate")
librariesVersion <- c()
for(i in seq_along(libraries))
    librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i])))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: magrittr
## Loading required package: DT
## Loading required package: ggplot2
## Loading required package: jsonlite
## Loading required package: parallel
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
../000.core/00.01.libraries.R completed in 0.83 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# @authors kp@eworx.gr ako@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t",
                     encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
    if(length(colClasses) == 0)
        return(data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep,
            encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose,
            showProgress = TRUE, na.strings = na.strings))
    return(data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses,
        header = header, sep = sep, encoding = encoding, verbose = verbose,
        showProgress = TRUE, na.strings = na.strings))
}
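For illustration, a hypothetical call (the path and the column classes below are assumptions, not artefacts of this job):
demo <- readData("jobsOutput/tidySurvey/demographStat.tsv", # hypothetical file
    colClasses = c(id = "character", age = "integer"))      # hypothetical column classes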
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
fileName <- getSourcePath(filename, baseFolder)
dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
if(format == "rds") saveRDS(data, fileName)
if(format == "fst") fst::write_fst(data, fileName)
}
# Alternative to saveRDS for ad-hoc read/write operations; creates the target directory if missing.
saveRDS_ <- function(object, file){
dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
saveRDS(object, file)
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
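A minimal round trip through the two helpers, assuming a hypothetical tmp/example.fst path under the repository:
exampleDT <- data.table::data.table(id = 1:3, value = c("a", "b", "c"))
saveBinary(exampleDT, "tmp/example.fst", format = "fst") # fst for fast load
all.equal(exampleDT, loadBinary("tmp/example.fst", format = "fst")) # expected TRUE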
rowColumns <- function(data){
return(paste(format(nrow(data), big.mark = ","), "Rows X", ncol(data), "Columns"))
}
# As the mounted storage is in memory, make sure the asset include.css is there.
publishIncludeCss <- function(){
    sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
    destinationFile <- "/data/tmpfs/results/include.css"
    if (!file.exists(destinationFile)) {
        return(file.copy(sourceFile, destinationFile))
    }else{
        return(TRUE)
    }
}
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
    for(name in names(data)){
        if(is.character(data[[name]])){
            data[[name]] <- as.factor(data[[name]])
        }
    }
    return(data)
}
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName %in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data){
columnsName <- names(data)
#columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
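A hypothetical usage with a small frequency table, so the "Count" column receives the steel-blue bar styling:
exampleCounts <- data.table::data.table(employer = c("Zara", "Lidl", "H&M"), Count = c(120L, 85L, 60L))
reportTabularData(exampleCounts)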
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
cleanJsonId <- function(txt){
txt <- gsub("\\.json", "", txt)
gsub(".*/", "", txt)
}
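For example, applied to a hypothetical path:
cleanJsonId("/data/generic/jobsOutput/12345.json") # returns "12345"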
embed_data <- function(x = mtcars, filename = "file.csv", label = "Get data"){
    # Create an encoded Base64 data stream
    encode_data <- function(x){
        saveMe <- getSourcePath("file.csv")
        write.csv2(x, saveMe)
        enc <- sprintf('data:text/csv;base64,%s', openssl::base64_encode(paste0(readLines(saveMe), collapse = "\n")))
        unlink(saveMe)
        return(enc)
    }
# String result ready to be placed in rmarkdown
paste0("<a download='", filename, "' href=", encode_data(x), ">", label, "</a>")
}
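The returned anchor string is meant to be embedded inline in the R Markdown source, for example (hypothetical call):
embed_data(head(mtcars), filename = "mtcars-sample.csv", label = "Download sample")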
###########################################################################################################
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] lubridate_1.7.10 jsonlite_1.7.2 ggplot2_3.3.3 DT_0.18
## [5] magrittr_2.0.1 rmarkdown_2.8 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 bslib_0.2.5 compiler_4.0.5 pillar_1.6.0
## [5] jquerylib_0.1.4 highr_0.9 tools_4.0.5 digest_0.6.27
## [9] evaluate_0.14 lifecycle_1.0.0 tibble_3.1.1 gtable_0.3.0
## [13] pkgconfig_2.0.3 rlang_0.4.11 yaml_2.2.1 xfun_0.23
## [17] withr_2.4.2 stringr_1.4.0 dplyr_1.0.6 knitr_1.33
## [21] generics_0.1.0 htmlwidgets_1.5.3 sass_0.4.0 vctrs_0.3.8
## [25] grid_4.0.5 tidyselect_1.1.1 glue_1.4.2 R6_2.5.0
## [29] fansi_0.4.2 purrr_0.3.4 scales_1.1.1 htmltools_0.5.1.1
## [33] ellipsis_0.3.2 colorspace_2.0-1 utf8_1.2.1 stringi_1.6.2
## [37] munsell_0.5.0 crayon_1.4.1
../000.core/00.02.base.functions.R completed in 0.07 seconds
Dimensions: 353518
fileName <- "jobsOutput/tidySurvey/demographStat.fst"
demographStat <- loadBinary(fileName, format = "fst")
demographStat <- demographStat[id %in% strataData]
Dimensions: 353518, 20
fileName <- "jobsOutput/workExperience/occupationsForMatching/workExpEscoDT.fst"
workPredict <- loadBinary(fileName, format = "fst") %>% unique
workPredict <- workPredict[id %in% strataData]
Dimensions: 1699275, 12
inputRepo <- getSourcePath("jobsOutput/")
filename <- "escoIscoMapping.rds"
escoIscoMapping <- loadBinary(filename, inputRepo)
1.load.data.R completed in 40.96 seconds
To identify and de-duplicate the employers variable in the work experience data set:
First, the employers free-text is aggregated and the top 100 keywords/phrases are treated as the top employers.
Then, a manual annotation process is followed to identify the most common parts of keywords/phrases referring to the same employer, by looking at the most popular employers and using “grep”; for example, “donald” for “McDonald’s” and “Mc Donald’s”. Among the keywords/phrases identified as the same, the one that is kept is either the English-language one or the most popular one.
To identify the freelancers, the private-company employers and the family-business employers, a similar process is followed: the most common parts of the free-text are “grepped” and those employers are treated as the same.
Future work: a better approach would be a matching process similar to the one used to match the 3 pillars to ESCO. Find the top n employers to match the free-text against. Then cleanse both the top employers and the free-text, and create a new column with the cleansed free-text. After that, use a string-distance method, for example the Levenshtein distance, to match the cleansed free-text against the cleansed, most popular employers, as in the sketch below.
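A minimal sketch of that string-distance matching, assuming the stringdist package and hypothetical, already-cleansed topEmployers and freeText vectors:
library(stringdist)
topEmployers <- c("mcdonalds", "carrefour", "vodafone") # cleansed top-n employers (hypothetical)
freeText <- c("mc donalds", "carefour", "vodafon") # cleansed free-text entries (hypothetical)
# Levenshtein distance between every free-text entry and every top employer
distances <- stringdistmatrix(freeText, topEmployers, method = "lv")
nearest <- apply(distances, 1, which.min)
# accept a match only when the nearest employer is within a distance threshold
matchedEmployer <- ifelse(apply(distances, 1, min) <= 3, topEmployers[nearest], NA)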
# TO-DO: Move code in a proper location prior to this step
workDemo[, employer := factor(employer, levels = unique(employer))]
employers <- workDemo[!is.na(employer), .(count = .N), by = "employer"][order(-count)][count > 3]
levels <- levels(workDemo$employer)
changes <- c("donald" = "McDonald's", "poste italiane" = "Poste Italiane", "pingo" = "Pingo Doce",
"continente" = "Continente", "zara" = "Zara", "vodafone" = "Vodafone", "carrefour" = "Carrefour",
"teleperfo" = "Teleperformance", "amazon" = "Amazon", "accenture" = "Accenture", "randstad" = "Randstad",
"primark" = "Primark", "telekom" = "Deutsche Telekom", "decathlon" = "Decathlon Group", "h&m" = "H&M",
"hennes" = "H&M", "merlin" = "Leroy Merlin", "lidl" = "Lidl", "er king" = "Burger King", "sonae" = "Sonae",
"telepizza" = "Telepizza", "adecco" = "Adecco", "ryanair" = "Ryanair", "conad" = "Conad", "dhl" = "DHL",
"manpower" = "Manpower Group", "aiesec" = "AIESEC", "worten" = "Worten", "auchan" = "Auchan",
"pizza hut" = "Pizza Hut", "intermarch" = "Intermarché")
# Merge every factor level matching each keyword (case-insensitive) into its canonical employer name.
for(i in seq_along(changes)){
    name <- names(changes)[i]
    value <- changes[[i]]
    same <- levels[grepl(name, levels, ignore.case = TRUE)]
    levels[levels %in% same] <- value
}
freelancers <- c("Libero professionista", "Libero Professionista", "Libera professione", "libero professionista",
"Libera professionista", "Libera Professione", "Freelance", "freelance", "Freelancing")
levels[levels %in% freelancers] <- "Freelancer"
self_employed <- c("Autonomo", "Autónomo", "Autônomo", "Lavoro autonomo", "Lavoratore autonomo",
"Self employed", "Self Employed", "Self-Employed", "Self-employed")
levels[levels %in% self_employed] <- "Freelancer"
private <- c("Privato", "Privati", "privato", "Private", "privati", "PRIVATO", "Privat", "Privado",
"Azienda privata")
levels[levels %in% private] <- "Private Company"
family_business <- c("Azienda di famiglia", "Azienda familiare", "Famiglia", "famiglia privata",
"Famiglia privata", "Famiglia Privata", "Famiglie", "Famiglie private", "Famila")
levels[levels %in% family_business] <- "Family Business"
levels(workDemo$employer) <- levels
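For instance, a quick sanity check against workDemo confirms that the merged variants now collapse into single levels:
workDemo[!is.na(employer), .(count = .N), by = employer][order(-count)][1:10]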
2.process.data.R completed in 44.26 seconds
Datasource : /data/generic/jobsOutput/tidySurvey/workStat.fst of 403,372,557 bytes.
fileName <- "jobsOutput/tidySurvey/aggregateWorkStat.fst"
saveBinary(aggregateWork, fileName, format = "fst")
Datasource : /data/generic/jobsOutput/tidySurvey/aggregateWorkStat.fst of 456,792 bytes.
3.save.data.R completed in 2.27 seconds
Completed in 88.38 seconds.
An Eworx S.A. DSENSE report: a documented process for internal consumption.
– end of report –