Abstract
Combining data of stratified CV and performing aggregations and statistical computations to construct anonymized tidy datasets. New features and statistics are collected for work and qualifications.
librariesVersion <- c()
# Record the installed version of every package named in `libraries`
# (defined outside this excerpt) as a character vector, in the same order.
# vapply() replaces the former `1:length(libraries)` loop, which iterated over
# c(1, 0) when `libraries` was empty and re-allocated the result on every pass.
librariesVersion <- vapply(
  libraries,
  function(pkg) paste(packageVersion(pkg)),
  character(1),
  USE.NAMES = FALSE
)
# Attach each package. require() returns TRUE/FALSE instead of stopping, so
# `librariesLoaded` is a list with one load flag per package.
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: magrittr
## Loading required package: DT
## Loading required package: ggplot2
## Loading required package: jsonlite
## Loading required package: parallel
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
../000.core/00.01.libraries.R completed in 0.93 seconds
# Append `timeNeeded` to the running vector of per-script timings.
# Both variables are created outside this excerpt; presumably `timeNeeded` is
# the duration of the script that just completed - TODO confirm.
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
# Restart the stopwatch: the third element of proc.time() is the elapsed time.
source.starting.time <- proc.time()[3]
## Base functions
# @authors kp@eworx.gr ako@eworx.gr
# Default base folder that the helpers below prepend to relative file names.
repository <- "/data/generic/"

# Build the full path of a data file.
#
# filename    file name (or relative path) to append; may be a vector
# baseFolder  prefix to prepend; it is concatenated as-is, so it must already
#             end with a path separator
#
# Returns the concatenation of `baseFolder` and `filename`.
getSourcePath <- function(filename, baseFolder = repository) {
  paste0(baseFolder, filename)
}
# Read a delimited text file with data.table::fread().
#
# filename          file name relative to `baseFolder`
# colClasses        optional column classes; when empty, fread guesses the types
# baseFolder        folder prefix passed to getSourcePath()
# header, sep, encoding, na.strings, verbose
#                   forwarded unchanged to fread()
# stringsAsFactors  forwarded to fread() ONLY when `colClasses` is empty; with
#                   explicit column classes it is not passed on
#
# Returns whatever fread() returns for the resolved path.
readData <- function(filename, colClasses = c(), baseFolder = repository,
                     header = TRUE, sep = "\t", encoding = "UTF-8",
                     stringsAsFactors = TRUE, na.strings = c("", "NULL"),
                     verbose = FALSE) {
  path <- getSourcePath(filename, baseFolder)

  if (length(colClasses) > 0) {
    # Explicit column classes: the caller decides every column type.
    data.table::fread(
      input = path,
      colClasses = colClasses,
      header = header,
      sep = sep,
      encoding = encoding,
      verbose = verbose,
      showProgress = TRUE,
      na.strings = na.strings
    )
  } else {
    # No column classes: let fread guess and apply `stringsAsFactors`.
    data.table::fread(
      input = path,
      header = header,
      sep = sep,
      encoding = encoding,
      stringsAsFactors = stringsAsFactors,
      verbose = verbose,
      showProgress = TRUE,
      na.strings = na.strings
    )
  }
}
# Serialise an object under the repository: rds for small disk space, fst for
# fast load.
#
# data        object to store (fst requires a data-frame-like object)
# filename    file name relative to `baseFolder`; required. The former default
#             `filename = filename` referred to itself and could never be
#             evaluated, so omitting the argument was already an error.
# baseFolder  folder prefix passed to getSourcePath()
# format      "rds" or "fst"
#
# Returns, invisibly, the value of the underlying writer (NULL for rds, the
# result of fst::write_fst() for fst). Stops on any other `format` instead of
# creating the directory and then silently writing nothing.
saveBinary <- function(data, filename, baseFolder = repository, format = "rds") {
  supported <- c("rds", "fst")
  if (length(format) != 1L || is.na(format) || !format %in% supported) {
    stop(
      "Unsupported format '", paste(format, collapse = ", "),
      "': expected one of ", paste(supported, collapse = ", "),
      call. = FALSE
    )
  }

  fileName <- getSourcePath(filename, baseFolder)
  # Create missing parent folders; an already existing folder is not a problem.
  dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)

  if (format == "rds") {
    invisible(saveRDS(data, fileName))
  } else {
    invisible(fst::write_fst(data, fileName))
  }
}
# Thin wrapper around saveRDS() for ad-hoc writes to an arbitrary path: it
# creates the parent folder of `file` first, so the target directory does not
# have to exist beforehand.
#
# object  R object to serialise
# file    full path of the .rds file to write
saveRDS_ <- function(object, file) {
  parent_dir <- dirname(file)
  dir.create(path = parent_dir, showWarnings = FALSE, recursive = TRUE)
  saveRDS(object = object, file = file)
}
# Load an object stored under the repository: rds for small disk space, fst
# for fast load.
#
# filename       file name relative to `baseFolder`
# baseFolder     folder prefix passed to getSourcePath()
# format         "rds" or "fst"
# as.data.table  forwarded to fst::read_fst(); ignored for rds
#
# Returns the deserialised object. Stops on any other `format`; previously an
# unknown format silently returned NULL and the mistake only surfaced later.
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE) {
  supported <- c("rds", "fst")
  if (length(format) != 1L || is.na(format) || !format %in% supported) {
    stop(
      "Unsupported format '", paste(format, collapse = ", "),
      "': expected one of ", paste(supported, collapse = ", "),
      call. = FALSE
    )
  }

  path <- getSourcePath(filename, baseFolder)
  if (format == "rds") {
    return(readRDS(path))
  }
  fst::read_fst(path, as.data.table = as.data.table)
}
# Describe the size of a table as a short label, e.g. "1,500 Rows X  3 Columns".
# The row count is formatted with a thousands separator; the pieces are joined
# with single spaces (the literal "Rows X " already ends with one).
rowColumns <- function(data) {
  row_count <- format(nrow(data), big.mark = ",")
  column_count <- ncol(data)
  paste(row_count, "Rows X ", column_count, "Columns")
}
# Make sure the report stylesheet is present in the results folder.
# Returns TRUE when the file is already there; otherwise returns the result of
# file.copy() (TRUE on success, FALSE on failure).
publishIncludeCss <- function() {
  css_source <- "/data/jobs/wp41.analysis/000.core/include.css"
  css_target <- "/data/tmpfs/results/include.css"

  # Nothing to do when the stylesheet was already published.
  if (file.exists(css_target)) {
    return(TRUE)
  }
  file.copy(css_source, css_target)
}
# Note on publishIncludeCss() above: the results storage is mounted in memory, so make sure the include.css asset is there.
# Turn the printed-style output of summary() into a plain data frame.
#
# data  object to summarise (typically a data frame)
#
# Returns a data frame built from the unclassed summary table; column names are
# kept as produced by summary() (check.names = FALSE) and the cells stay
# character strings (stringsAsFactors = FALSE).
summariseTable <- function(data){
# Keep the summary() call inline: for input without column names data.frame()
# derives the column name from this argument expression.
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
# Alternative implementation kept for reference (unreachable after the return above):
#return(do.call(cbind, lapply(data, summary)))
}
# Convert every character column of a table into a factor.
#
# data  data frame (or data.table) whose columns are inspected by name
#
# Returns `data` with each character column replaced by as.factor() of itself;
# all other columns are left untouched.
factoriseCharacterColumns <- function(data) {
  for (name in names(data)) {
    # is.character() gives a single TRUE/FALSE for any column. The former
    # `class(x) == "character"` produced a length-2 condition for multi-class
    # columns such as POSIXct, which is an error in `if` since R 4.2.
    if (is.character(data[[name]])) {
      data[[name]] <- as.factor(data[[name]])
    }
  }
  data
}
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
# Upper-case the first character of each string and leave the rest unchanged.
# Vectorised over `x`.
capitalise <- function(x) {
  first_letter <- toupper(substring(x, 1, 1))
  remainder <- substring(x, 2, nchar(x))
  paste0(first_letter, remainder)
}
# Add an in-cell colour bar to one column of a DT widget, if that column exists.
#
# data                  table the widget was built from; supplies the values
#                       that scale the bar
# result                DT widget to decorate
# columnName            name of the column to style
# color                 bar colour
# columnsName_original  column names of `data`; the column is styled only when
#                       `columnName` is among them
#
# Returns the decorated widget, or `result` unchanged when the column is absent.
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original) {
  if (!(columnName %in% columnsName_original)) {
    return(result)
  }

  formatStyle(
    result,
    columnName,
    background = styleColorBar(data[[columnName]], color),
    backgroundSize = '100% 90%',
    backgroundRepeat = 'no-repeat',
    backgroundPosition = 'center'
  )
}
# Render a table as an interactive DT widget for the report.
#
# The widget has per-column filters on top, no row names, 20 rows per page,
# left-aligned cells, copy/csv/excel/pdf export buttons and search
# highlighting. The columns "Count", "sourceTimeNeeded", "timeNeeded" and
# "percentMatch" get an in-cell colour bar when they are present in `data`.
#
# data  table to display
#
# Returns the DT widget.
reportTabularData <- function(data) {
  column_names <- names(data)

  widget <- DT::datatable(
    data,
    class = 'cell-border stripe',
    filter = 'top',
    rownames = FALSE,
    colnames = column_names,
    extensions = 'Buttons',
    options = list(
      pageLength = 20,
      columnDefs = list(list(className = 'dt-left', targets = "_all")),
      dom = 'Bfrtip',
      buttons = c('copy', 'csv', 'excel', 'pdf'),
      searchHighlight = TRUE,
      # Draw a border around the header once the table has been initialised.
      initComplete = JS(
        "function(settings, json) {",
        "$(this.api().table().header()).css({'border': '1px solid'});",
        "}"
      )
    )
  )

  # Column name -> bar colour, applied in this order.
  bar_colours <- c(
    Count = 'steelblue',
    sourceTimeNeeded = '#808080',
    timeNeeded = '#808080',
    percentMatch = '#4682b4'
  )
  for (bar_column in names(bar_colours)) {
    widget <- styliseDTNumericalColumn(
      data, widget, bar_column, bar_colours[[bar_column]], column_names
    )
  }
  widget
}
# Font lookup table: each listed family name maps to "DejaVu Serif".
# NOTE(review): "sans" and "mono" both point at a serif face - confirm this is
# intended wherever `fonts` is consumed (not visible in this excerpt).
fonts <- as.list(c(
  sans = "DejaVu Serif",
  mono = "DejaVu Serif",
  `Times New Roman` = "DejaVu Serif"
))
# Reduce a JSON file path to a bare identifier: every ".json" occurrence is
# removed, then everything up to and including the last "/" is dropped.
# Vectorised over `txt`.
cleanJsonId <- function(txt) {
  without_extension <- gsub("\\.json", "", txt)
  gsub(".*/", "", without_extension)
}
# Build an HTML download link that embeds a table as a Base64 data URI, ready
# to be placed in an R Markdown document.
#
# x         table to embed; written with write.csv2()
# filename  file name suggested to the browser (the `download` attribute)
# label     visible link text
#
# Returns a single string holding the <a> element.
embed_data <- function(x = mtcars, filename = "file.csv", label = "Get data") {
  # Create the Base64-encoded data URI for `x`.
  encode_data <- function(x) {
    # Use a private temp file rather than a fixed file in the shared
    # repository, and remove it even when writing or reading fails.
    csv_path <- tempfile(fileext = ".csv")
    on.exit(unlink(csv_path), add = TRUE)
    write.csv2(x, csv_path)
    csv_text <- paste0(readLines(csv_path), collapse = "\n")
    sprintf("data:text/csv;base64,%s", openssl::base64_encode(csv_text))
  }

  # The href value is quoted: Base64 padding contains "=", which is not allowed
  # in an unquoted HTML attribute value.
  paste0(
    "<a download='", filename, "' href='", encode_data(x), "'>", label, "</a>"
  )
}
###########################################################################################################
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] lubridate_1.7.10 jsonlite_1.7.2 ggplot2_3.3.3 DT_0.18
## [5] magrittr_2.0.1 rmarkdown_2.8 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 bslib_0.2.5 compiler_4.0.5 pillar_1.6.0
## [5] jquerylib_0.1.4 highr_0.9 tools_4.0.5 digest_0.6.27
## [9] evaluate_0.14 lifecycle_1.0.0 tibble_3.1.1 gtable_0.3.0
## [13] pkgconfig_2.0.3 rlang_0.4.11 yaml_2.2.1 xfun_0.23
## [17] withr_2.4.2 stringr_1.4.0 dplyr_1.0.6 knitr_1.33
## [21] generics_0.1.0 htmlwidgets_1.5.3 sass_0.4.0 vctrs_0.3.8
## [25] grid_4.0.5 tidyselect_1.1.1 glue_1.4.2 R6_2.5.0
## [29] fansi_0.4.2 purrr_0.3.4 scales_1.1.1 htmltools_0.5.1.1
## [33] ellipsis_0.3.2 colorspace_2.0-1 utf8_1.2.1 stringi_1.6.2
## [37] munsell_0.5.0 crayon_1.4.1
../000.core/00.02.base.functions.R completed in 0.07 seconds
Dimensions: ****
# Load the demographics table (rds) from the repository.
fileName <- "jobsOutput/demographDT.rds"
demographDT <- loadBinary(fileName)
# Keep only the rows whose id appears in `strataData`, which is defined outside
# this excerpt (presumably the ids of the stratified CV sample - TODO confirm).
demographDT <- demographDT[id %in% strataData]
Dimensions: 353518, 9
# Load the work-experience table (rds) from the repository and drop duplicate rows.
fileName <- "jobsOutput/workEXPDT.rds"
workEXPDT <- loadBinary(fileName)%>%unique
# Keep only the rows whose id appears in `strataData` (defined outside this excerpt).
workEXPDT <- workEXPDT[id %in% strataData]
Dimensions: 1699275, 7
# Load the education predictions table (rds) from the repository and drop duplicate rows.
fileName <- "jobsOutput/education/predictions/cvPredictions.rds"
educationDT <- loadBinary(fileName)%>%unique
# Keep only the rows whose id appears in `strataData` (defined outside this excerpt).
educationDT <- educationDT[id %in% strataData]
Dimensions: 332553, 6
# Load the headline/occupation table from the repository and drop duplicate
# rows. This one is stored as fst, so it is read with format = "fst".
fileName <- "jobsOutput/workExperience/occupationsForMatching/headlinesEscoDT.fst"
headlineDT <- loadBinary(fileName, format = "fst") %>% unique
# Keep only the rows whose id appears in `strataData` (defined outside this excerpt).
headlineDT <- headlineDT[id %in% strataData]
Dimensions: 353518, 15
1.load.data.R completed in 77.49 seconds
# Keep only headline rows of the three listed types, with the columns needed below.
headlineData <- headlineDT[type_code %in% c("job_applied_for", "preferred_job", "position"), .(id, type_code, iscoLabel)]
# Rename the selected columns by position: type_code -> headline_type, iscoLabel -> headline_isco.
setnames(headlineData, c("id", "headline_type", "headline_isco"))
# Left join on id: every row of demographStat (built outside this excerpt) is
# kept, with NA headline fields where there is no match.
# NOTE(review): if an id has more than one matching headline row, the merge
# repeats that id's demographStat row - confirm headlineData is unique per id.
demographStat <- merge(demographStat, headlineData, by = "id", all.x = TRUE)
2.process.data.R completed in 13.99 seconds
# Store the merged demographic statistics in fst format under the repository.
fileName <- "jobsOutput/tidySurvey/demographStat.fst"
saveBinary(demographStat, fileName, format = "fst")
# Show the first rows of the saved table.
head(demographStat)
## id locale creationDate
## Length:353518 Length:353518 Min. :2005-10-22 00:00:00
## Class :character Class :character 1st Qu.:2018-05-07 14:23:36
## Mode :character Mode :character Median :2019-07-16 11:01:21
## Mean :2018-09-05 18:27:23
## 3rd Qu.:2019-08-30 06:47:02
## Max. :2019-09-30 04:50:31
## NA's :74
## lastUpdate postalcode country
## Min. :2019-07-15 12:09:20 Length:353518 Length:353518
## 1st Qu.:2019-08-11 07:47:52 Class :character Class :character
## Median :2019-09-02 13:42:03 Mode :character Mode :character
## Mean :2019-08-28 21:22:30
## 3rd Qu.:2019-09-16 18:37:40
## Max. :2033-01-11 00:16:39
##
## gender birthdate nationality work_years
## Length:353518 Length:353518 Length:353518 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 3.00
## Mode :character Mode :character Mode :character Median : 6.00
## Mean : 9.27
## 3rd Qu.: 12.00
## Max. :312.00
## NA's :51194
## num_work min_work_years max_work_years mean_work_years
## Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
## 1st Qu.: 2.00 1st Qu.: 1.00 1st Qu.: 1.00 1st Qu.: 1.00
## Median : 4.00 Median : 1.00 Median : 2.00 Median : 1.50
## Mean : 4.19 Mean : 1.63 Mean : 4.37 Mean : 2.53
## 3rd Qu.: 6.00 3rd Qu.: 1.00 3rd Qu.: 5.00 3rd Qu.: 2.75
## Max. :13.00 Max. :55.00 Max. :57.00 Max. :55.00
## NA's :42107 NA's :51194 NA's :51194 NA's :51194
## is_employed eqf_level eqf_previous is_student
## Mode :logical Min. :1.00 Min. :1.00 Mode :logical
## FALSE:157306 1st Qu.:4.00 1st Qu.:4.00 FALSE:247364
## TRUE :154105 Median :6.00 Median :6.00 TRUE :85189
## NA's :42107 Mean :5.21 Mean :5.17 NA's :20965
## 3rd Qu.:6.00 3rd Qu.:6.00
## Max. :8.00 Max. :8.00
## NA's :107677 NA's :298489
## headline_type headline_isco
## Length:353518 Length:353518
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
Datasource : /data/generic/jobsOutput/tidySurvey/demographStat.fst of 45,093,922 bytes.
3.save.data.R completed in 1.69 seconds
Completed in 94.17 seconds.
An Eworx S.A. DSENSE - A documented process for internal consumption.
– end of report –