Abstract
The process of occupation identification has two major phases. A matching method is used to identify free-text words in the ESCO vocabulary. The process is language agnostic, and the languages covered are those of the ESCO classification of occupations. The large amount of data, the multilingual nature of the problem and the high computational complexity demand scalable solutions. A low-level implementation in C is used for matching, and all intensive calculations are parallelized.
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]
Information about the libraries, environment, sources used and their execution is reported. Additional information is provided within section tabs. Navigating through the report is also possible through the table of contents. Reported tables can be dynamically filtered, searched, ordered and exported into various formats.
librariesVersion <- c()
for(i in 1:length(libraries))
librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: magrittr
## Loading required package: DT
## Loading required package: text2vec
## Loading required package: stringdist
##
## Attaching package: 'stringdist'
## The following object is masked from 'package:magrittr':
##
## extract
## Loading required package: parallel
## Loading required package: stopwords
../000.core/00.01.libraries.R completed in 1.69 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# EPAS-DS
# @authors ds@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
if(length(colClasses) == 0)
return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
}
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename = filename, baseFolder = repository, format = "rds"){
fileName <- getSourcePath(filename, baseFolder)
dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
if(format == "rds") saveRDS(data, fileName)
if(format == "fst") fst::write_fst(data, fileName)
}
#alternative for rough read write operations
saveRDS_ <- function(object, file){
dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
saveRDS(object, file)
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
rowColumns <- function(data){
return(paste( format(nrow(data), big.mark=","), "Rows X ", ncol(data), "Columns"))
}
#as the mount storage is in memory, make sure the asset include.css is there
publishIncludeCss <- function(){
sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
destinationFile <- "/data/tmpfs/results/include.css"
if (!file.exists(destinationFile)) {
return(file.copy(sourceFile, destinationFile))
}else{
return(TRUE)
}
}
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
for(name in names(data)){
if( class(data[[name]]) =="character"){
data[[name]] <- as.factor(data[[name]])
}
}
return(data)
}
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName %in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data, anonymize=TRUE){
if(anonymize)return()
columnsName <- names(data)
columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
#result <- styliseDTNumericalColumn(data,result, "percentMatch", '#5fba7d', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
#read_xml_to_list <- function(filepath, is.gz = FALSE){
# if(is.gz){
# temp_data <- paste0(repository, "data/delete.me")
# result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
# Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
# unlink(temp_data)
# result
# }else{
# xmlToList(xmlParse(filepath))
# }
#}
#transpose_list_to_dt <- function(data_list){
# dt <- t(as.data.table(data_list))
# dt <- as.data.table(dt)
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# names(dt) <- names(data_list[[1]])
# dt
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} #removing html tags
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)}
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words with 35 or more characters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between number and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters identified in the beginning of text
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
trimws(text)
}
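A minimal usage sketch with made-up inputs, showing the default cleansing and the number/letter spacing option:
cleansingCorpus("<li>Project Manager (2019-2021)</li>")
#returns "project manager" (tags, digits and punctuation stripped, case lowered)
cleansingCorpus("Java8 developer", rem.nonalphanum = FALSE)
#returns "java 8 developer" (a space is added between digits and letters)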
#This function removes dates that are "relics" from the xml parsing
removeDates <- function(text){
days <- "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
months <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
date_form1 <- paste(days, months, "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]")
date_form2 <- "\\?[0-9][0-9][0-9][0-9]"
text <- gsub(date_form1, " ", text)
gsub(date_form2, " ", text)
}
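A small illustration with a made-up string:
removeDates("Updated Monday, January 4, 2021 by the employer")
#the date pattern is replaced by a space, leaving "Updated" and "by the employer"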
xmlToDataTable <- function(xmlData, itemNames){
itemList <- lapply(itemNames,
function(x){
xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
}
)
names(itemList) <- itemNames
as.data.table(itemList)
}
cleanCorpusHtml <- function(text){
unlist(lapply(text, function(x){
if(nchar(x) > 0){
# some descriptions start with keyword tags inside <li> nodes; the <li> text is moved to the end
# so the information is kept and the description starts with the main content
html <- gsub(">","> ", x) # add spaces after html tags so these aren't concatenated
xml <- read_xml(html, as_html = TRUE)
lis <- xml_find_all(xml, ".//li")
xml_remove(lis)
text <- paste( paste(xml_text(xml), collapse ="") , paste(xml_text(lis) , collapse =""), collapse ="")
text <- gsub("\\s+"," ", text)
}else {""}
}))
}
#Split a vector into n_chunks chunks of (nearly) equal size
equal_split <- function(vct, n_chunks) {
lim <- length(vct)
fstep <- lim%/%n_chunks
idx_list <- list()
for(i in seq(n_chunks - 1)){
idx_list[[i]] <- vct[((i-1)*fstep + 1):(i*(fstep))]
}
idx_list[[n_chunks]] <- vct[((n_chunks - 1)*fstep + 1):(lim)]
return(idx_list)
}
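For example, splitting ten indexes into three chunks puts the remainder into the last chunk:
equal_split(1:10, 3)
#returns a list of three chunks: 1:3, 4:6 and 7:10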
#Takes a vector of scores and returns the ids of the (at most numHead) smallest values above the threshold
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
vct <- ifelse(vct > threshold, vct, Inf)
indexVec <- head(order(vct), numHead)
idVec[indexVec]
}
#Extract an attribute from all nodes of a given element type in the description xml
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
xml_attr(xml_find_all(nodes, paste0(".//", elementType)), attribute)
}
#Function to add results to datatable
elementsToDataTable <- function(result, elementType){
if(length(result) > 0)
data.table(elementType = elementType, attributeValue = result)
else
data.table()
}
#Function to retrieve urls from text
keepHtmlElements <- function(feedItem){
nodes <- read_xml(paste0("<div>", feedItem, "</div>"), as_html = TRUE)
rbind(
elementsToDataTable(maintainElements(nodes, "a", "href"), "link"),
elementsToDataTable(maintainElements(nodes, "img", "src"), "image"),
elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")
#All "img-src" are NA
)
}
#Retrieve the list of parameters in an http request query
getQueryParams <- function(url){
query <- httr::parse_url(url)$query
queryValues <- unlist(query)
queryNames <- names(query)
dat <- data.table(varName = queryNames, value = queryValues)
dat[queryValues != ""]
}
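A minimal usage sketch (the URL is hypothetical and the httr package is assumed to be installed):
getQueryParams("https://example.org/search?country=EL&year=2020")
#returns a data.table with varName = c("country", "year") and value = c("EL", "2020"); empty parameters are dropped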
#Keep the country code from a file name (the text between the last underscore and the first dot that follows)
keepCountryName <- function(string){
string <- gsub(".*_", "", string)
gsub("\\..*", "", string)
}
#Keep at most num occurrences of each distinct token in a space-separated string
keepNTokens <- function(string, num){
tokenList <- strsplit(string, split = " ")
sapply(tokenList, function(tokens){
tokens <- sort(tokens)
tokensShift <- shift(tokens, -num, fill = FALSE)
paste(tokens[tokens != tokensShift], collapse = " ")
})
}
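For example, capping each token at two occurrences:
keepNTokens("data data data science", 2)
#returns "data data science" (tokens are sorted and each distinct token is kept at most num times)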
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {
tokensList <- strsplit(corpus[, text], " ")
names(tokensList) <- corpus[, code]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count), by = "class"]
if(normalize == "log")tfDT[, tf := log(1 +term_count)]
merge(tfDT, idfDT, by = "term")[, tfIdf := tf * idf][, .(term, class, tfIdf)]
}
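The weighting is a smoothed tf-idf: idf = log(N / (docFreq + 1)) + 1 over the N classes, and, with the default double normalisation, tf = 0.5 + 0.5 * term_count / max(term_count) within each class. A minimal usage sketch with a made-up two-document corpus:
toyCorpus <- data.table(code = c("nurse", "teacher"),
text = c("care for patients in hospital", "teach maths to pupils in school"))
findTFIDF(toyCorpus, stopwords = c("for", "to", "in"))
#returns one row per (term, class) pair; a term unique to one of the two classes gets idf = log(2/2) + 1 = 1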
tidyJsonData <- function(jsonList){
if(length(jsonList) == 0)return(NULL)
unlistOccupations <- jsonList %>% unlist
codesMaleFemale <- names(unlistOccupations)
epasMapping <- data.table(unlistOccupations)
epasMapping[ , code := gsub("\\.[[:alpha:]]$", "", codesMaleFemale)]
uniqueMappingBoolean <- epasMapping[ , unlistOccupations != c(unlistOccupations[-1], F), by = code]$V1
codesEpasDB <- epasMapping[uniqueMappingBoolean]
names(codesEpasDB) <- c("title", "code")
codesEpasDB[ , title := cleansingCorpus(title)]
}
###########################################################################################################
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] stopwords_2.2 stringdist_0.9.6.3 text2vec_0.6 DT_0.18
## [5] magrittr_2.0.1 rmarkdown_2.8 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 mlapi_0.1.0 knitr_1.33
## [4] RhpcBLASctl_0.20-137 float_0.2-4 lattice_0.20-41
## [7] R6_2.5.0 rlang_0.4.11 lgr_0.4.2
## [10] stringr_1.4.0 highr_0.9 tools_4.0.5
## [13] grid_4.0.5 xfun_0.23 jquerylib_0.1.4
## [16] htmltools_0.5.1.1 yaml_2.2.1 digest_0.6.27
## [19] rsparse_0.4.0 crayon_1.4.1 Matrix_1.3-2
## [22] vctrs_0.3.8 sass_0.4.0 htmlwidgets_1.5.3
## [25] evaluate_0.14 stringi_1.6.2 compiler_4.0.5
## [28] bslib_0.2.5 jsonlite_1.7.2
../000.core/00.02.base.functions.R completed in 0.12 seconds
inputRepo <- getSourcePath("jobsOutput/workExperience/")
filename <- "weightedTokensListEscoEpas.rds"
weightedTokensList <- loadBinary(filename, inputRepo)
Type of data: list.
Dimensions: ****.
Column Names: ar, bg, cs, da, de, el, en-us, en, es, et, fi, fr, ga, hr, hu, is, it, lt, lv, mt, nl, no, pl, pt, ro, sk, sl, sv.
filename <- "sortedVocabularyListEscoEpas.rds"
sortedVocabularyList <- loadBinary(filename, inputRepo)
Type of data: list.
Dimensions: ****.
Column Names: ar, bg, cs, da, de, el, en-us, en, es, et, fi, fr, ga, hr, hu, is, it, lt, lv, mt, nl, no, pl, pt, ro, sk, sl, sv.
Type of data: data.table, data.frame.
Dimensions: 1053923, 4.
Column Names: id, locale, label, index.
Type of data: data.table, data.frame.
Dimensions: 2741, 2.
Column Names: code, count.
escoRepo <- getSourcePath("input/esco_isco_bundle/esco")
fileNames <- list.files(escoRepo, full.names = TRUE)
escoCountries <- keepCountryName(fileNames)
inputRepo <- getSourcePath("jobsOutput/")
filename <- "escoIscoMapping.rds"
escoIscoMapping <- loadBinary(filename, inputRepo)
Type of data: list.
Number of elements: 28
01.00.load.data.R completed in 10.97 seconds
Tokenising free text entries for each language with multi-threading.
langInFreeText <- freeText$locale %>% unique
langInEsco <- names(weightedTokensList)
languages <- langInFreeText[langInFreeText %in% langInEsco]
freeTextPerLang <- lapply(languages, function(x)freeText[locale == x])
names(freeTextPerLang) <- languages
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores, type = "FORK")
freeTextTokenList <- parLapply(cl, freeTextPerLang, function(x){
wordTokensUnknown <- word_tokenizer(x$label)
names(wordTokensUnknown) <- paste0(x$index, "_")
res <- wordTokensUnknown %>% unlist
res #[nchar(res) > 3]
})
stopCluster(cl)
Discarding the lowest 8% of weights.
weightedTokensList <- lapply(weightedTokensList, function(x){
threshold <- quantile(x$word_weight, thr)
x[word_weight > threshold]
})
02.00.process.data.R completed in 17.99 seconds
This process maps words retrieved from free text to vocabulary terms of the EPAS backend. The Optimal String Alignment (OSA) distance is used, which extends the Levenshtein distance with transposition of adjacent characters as an additional edit operation. These computations are expensive, so a low-level interface to C with multithreading is used.
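For illustration only (not the exact call used below), the stringdist package loaded earlier exposes these C-level computations; the sketch shows how OSA differs from plain Levenshtein and how a misspelled token could be matched against a small, hypothetical vocabulary:
stringdist::stringdist("ab", "ba", method = "osa") #1 edit: a single transposition
stringdist::stringdist("ab", "ba", method = "lv")  #2 edits: Levenshtein needs two substitutions
stringdist::amatch("enginer", c("engineer", "teacher", "nurse"), method = "osa", maxDist = 2) #returns 1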
stopwordsLang <- c(stopwords_getlanguages("snowball"), stopwords_getlanguages("misc"))
getMatches <- lapply(seq_along(freeTextTokenList), function(i){
language <- names(freeTextTokenList)[i]
voca <- sortedVocabularyList[[language]]
freeTokens <- freeTextTokenList[[i]]
if(language %in% stopwordsLang) freeTokens <- freeTokens[!freeTokens %in% stopwords(language)]
vocaIndexes <- match(freeTokens, voca)
res <- data.table(index = gsub("_.*", "", names(freeTokens)), word = voca[vocaIndexes])
res <- res[!is.na(index)][!is.na(word)]
res <- res[!duplicated(res)]
})
names(getMatches) <- names(freeTextTokenList) #name the match tables by language so they can be indexed by locale below
## Warning: 'stopwords(language = "el")' is deprecated.
## Use 'stopwords(language = "el", source = "misc")' instead.
## See help("Deprecated")
Split the matches into multiple chunks to reduce space complexity.
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
indexChunks <- parLapply(cl, getMatches, function(matches){
#split() recycles the grouping vector 1:100, distributing the unique indexes round-robin into 100 chunks
indexList <- split(unique(matches$index), 1:100)
lapply(indexList, function(indexes){
which(matches$index %in% indexes)
})
})
stopCluster(cl)
Use the matches to find the top 10 recommended ESCO occupations for each free-text occupation entry.
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
clusterExport(cl, varlist=c("getMatches", "weightedTokensList", "indexChunks", "numReco"))
mergeLangData <- parLapply(cl, languages, function(lang){
library(data.table)
getMatchesLang <- getMatches[[lang]]
weightTokens <- weightedTokensList[[lang]]
res <- lapply(indexChunks[[lang]], function(indexes){
getMatchesLangChunk <- getMatchesLang[indexes]
dat <- weightTokens[getMatchesLangChunk, on = "word", allow.cartesian = TRUE]
dat <- dat[!is.na(code)]
dat <- dat[!duplicated(dat)]
dat[ , .(total_weight = sum(word_weight)), by = c("code", "index")][order(index, -total_weight)][, head(.SD, numReco), by = "index"][, .(code, index, total_weight)]
})
rbindlist(res)
})
stopCluster(cl)
Join the free-text entries with the predicted occupations.
freeIdToCode <- mergeLangData %>% rbindlist
freeTextPredictions <- merge(freeText, freeIdToCode, by = "index", all = TRUE)
Add the English occupation title next to each predicted occupation.
occupationsESCO <- allEscoList[["en"]]
occupationsESCO[, conceptUri := gsub(".*/", "", conceptUri)]
occupationsESCO[, preferredLabel := trimws(gsub("\\n", " ", preferredLabel))]
freeTextPredictions[, suggestEscoTitle := occupationsESCO$preferredLabel[match(code, occupationsESCO$conceptUri)]]
Add the ISCO level 3 label next to each predicted occupation.
freeTextPredictions[, suggestIscoCode3 := escoIscoMapping$iscoCode3[match(code, escoIscoMapping$occupationCode)]]
freeTextPredictions[, suggestIscoLabel3 := escoIscoMapping$iscoLabel3[match(code, escoIscoMapping$occupationCode)]]
Group the predicted occupations by ISCO level 3, sum the weights, and keep, in a temporary variable, the ISCO level 3 label with the highest weight sum for each entry.
Then, add a new column to the predictions that will contain the predicted ISCO level 3.
Finally, keep the occupations that belong to the ISCO level 3 with the highest sum and, from those occupations, keep the one with the highest weight within that ISCO level.
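The chunk that derives iscoToKeep is not echoed in this report; a minimal sketch of the grouping described above, assuming the columns already used here (index, suggestIscoLabel3, total_weight), could look like:
#hypothetical sketch, not the echoed implementation
iscoWeights <- freeTextPredictions[, .(iscoWeight = sum(total_weight)), by = c("index", "suggestIscoLabel3")]
topIsco <- iscoWeights[order(index, -iscoWeight)][, head(.SD, 1), by = "index"]
freeTextPredictions[, iscoToKeep := topIsco$suggestIscoLabel3[match(index, topIsco$index)]]
freeTextPredictions <- freeTextPredictions[order(index, -total_weight)]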
freeTextPredictions <- freeTextPredictions[suggestIscoLabel3 == iscoToKeep][, head(.SD, 1), by = "index"][, iscoToKeep := NULL]
setnames(freeTextPredictions, "code", "suggestedUri")
inputRepo <- getSourcePath("jobsOutput/workExperience/")
filename <- "processedWorkExp.fst"
workExpDT <- loadBinary(filename, inputRepo, format = "fst")
workExpEscoDT <- merge(workExpDT, freeTextPredictions[, .(index, suggestedUri, suggestEscoTitle, suggestIscoCode3, suggestIscoLabel3)], by = "index", all.x = TRUE)
03.00.process.data.R completed in 224.46 seconds
Saving the work-experience-related data with all suggested ESCO URIs.
outputRepo <- getSourcePath("jobsOutput/workExperience/occupationsForMatching/")
filename <- "workExpEscoDT.fst"
saveBinary(workExpEscoDT, filename, outputRepo, format = "fst")
data <- workExpEscoDT[!is.na(suggestEscoTitle)]
data[, .(count = .N, percentage = .N*100/nrow(data)), by = suggestEscoTitle][order(-count)] %>% head(50) %>% reportTabularData
## NULL
data <- workExpEscoDT[!is.na(suggestIscoLabel3)]
data[, .(count = .N, percentage = .N*100/nrow(data)), by = suggestIscoLabel3][order(-count)] %>% head(50) %>% reportTabularData
## NULL
Datasource : /data/generic/jobsOutput/workExperience/occupationsForMatching/workExpEscoDT.fst of 334,491,330 bytes.
Datasource : /data/generic/jobsOutput/workExperience/occupationsForMatching/weightedTokensUsed.rds of 44,204,399 bytes.
04.00.save.data.R completed in 15.14 seconds
Completed in 270.38 seconds.