Abstract
This function loads the skills data sets for all the different locales, cleans the free text and keeps only the relevant information. In particular, for each ESCO skill, the URI, the skill name and the associated text are saved for each language, as provided by ESCO.
# http://rmarkdown.rstudio.com/html_document_format.htm
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]
Information about the libraries, environment, sources used and their execution is reported. Additional information is provided within section tabs. Navigating through the report is also possible through the table of contents. Reported tables can be dynamically filtered, searched, ordered and exported into various formats.
librariesVersion <- c()
for(i in 1:length(libraries))
librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: DT
## Loading required package: stringr
## Loading required package: xml2
## Loading required package: parallel
../000.core/00.01.libraries.R completed in 0.49 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# ESCO skills
# @authors ds@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
if(length(colClasses) == 0)
return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
}
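As a usage sketch, readData resolves the filename against the repository folder and delegates to data.table::fread; the path below is hypothetical:
dt <- readData("input/example/data.tsv") # reads /data/generic/input/example/data.tsv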
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
fileName <- getSourcePath(filename, baseFolder)
dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
if(format == "rds") saveRDS(data, fileName)
if(format == "fst") fst::write_fst(data, fileName)
}
#alternative for rough read write operations
saveRDS_ <- function(object, file){
dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
saveRDS(object, file)
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
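A minimal round trip of the two helpers, with a hypothetical target file under the repository folder (the directory is created if missing):
dt <- data.table::data.table(id = 1:3, label = c("a", "b", "c"))
saveBinary(dt, filename = "examples/demo.rds") # hypothetical path
loadBinary("examples/demo.rds")
##    id label
## 1:  1     a
## 2:  2     b
## 3:  3     c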
rowColumns <- function(data){
return(paste( format(nrow(data), big.mark=","), "Rows X ", ncol(data), "Columns"))
}
publishIncludeCss <- function(){
sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
destinationFile <- "/data/tmpfs/results/include.css"
if (!file.exists(destinationFile)) {
return(file.copy(sourceFile, destinationFile))
}else{
return(TRUE)
}
}
#As the mounted storage is in memory, make sure the asset include.css is there.
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
for(name in names(data)){
if(is.character(data[[name]])){
data[[name]] <- as.factor(data[[name]])
}
}
return(data)
}
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName %in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data, anonymize=TRUE){
if(anonymize)return()
columnsName <- names(data)
columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
#result <- styliseDTNumericalColumn(data,result, "percentMatch", '#5fba7d', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
#read_xml_to_list <- function(filepath, is.gz = FALSE){
# if(is.gz){
# temp_data <- paste0(repository, "data/delete.me")
# result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
# Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
# unlink(temp_data)
# result
# }else{
# xmlToList(xmlParse(filepath))
# }
#}
#transpose_list_to_dt <- function(data_list){
# dt <- t(as.data.table(data_list))
# dt <- as.data.table(dt)
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# names(dt) <- names(data_list[[1]])
# dt
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} # removing html commands
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)}
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words with 35 or more characters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between numbers and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters identified in the beginning of text
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
trimws(text)
}
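For illustration, the default pipeline applied to a hand-made snippet (not taken from the ESCO data): html tags and the URL are stripped, punctuation becomes space, case is lowered and whitespace is collapsed.
cleansingCorpus("<p>Manage <b>ESCO</b> skills.\nSee https://example.com/info.html now</p>")
## [1] "manage esco skills see now"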
#This function removes dates that are "relics" from the xml parsing
removeDates <- function(text){
days <- "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
months <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
date_form1 <- paste(days, months, "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]")
date_form2 <- "\\?[0-9][0-9][0-9][0-9]"
text <- gsub(date_form1, " ", text)
gsub(date_form2, " ", text)
}
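A quick check on a made-up string; the leftover whitespace is collapsed later by cleansingCorpus:
removeDates("Updated Monday, January 5, 2020 by the team")
## [1] "Updated   by the team"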
xmlToDataTable <- function(xmlData, itemNames){
itemList <- lapply(itemNames,
function(x){
xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
}
)
names(itemList) <- itemNames
as.data.table(itemList)
}
cleanCorpusHtml <- function(text){
unlist(lapply(text, function(x){
if(nchar(x) > 0){
# because some nodes start with tag keywords inside li elements, the li text is relocated to the end,
# so the information remains and the description starts with the main content
html <- gsub(">","> ", x) # add spaces after html tags so these aren't concatenated
xml <- read_xml(html, as_html = TRUE)
lis <- xml_find_all(xml, ".//li")
xml_remove(lis)
text <- paste( paste(xml_text(xml), collapse ="") , paste(xml_text(lis) , collapse =""), collapse ="")
text <- gsub("\\s+"," ", text)
}else {""}
}))
}
#Split a vector into n_chunks chunks of roughly equal size
equal_split <- function(vct, n_chunks) {
lim <- length(vct)
fstep <- lim%/%n_chunks
idx_list <- list()
for(i in seq(n_chunks - 1)){
idx_list[[i]] <- vct[((i-1)*fstep + 1):(i*(fstep))]
}
idx_list[[n_chunks]] <- vct[((n_chunks - 1)*fstep + 1):(lim)]
return(idx_list)
}
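For example, splitting ten indices into three chunks leaves the remainder in the last chunk:
equal_split(1:10, 3)
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] 4 5 6
##
## [[3]]
## [1]  7  8  9 10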
#Function that takes a vector and returns the IDs of the (at most numHead) smallest values above the threshold
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
vct <- ifelse(vct > threshold, vct, Inf)
indexVec <- head(order(vct), numHead)
idVec[indexVec]
}
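A small sketch with made-up distances: values at or below the threshold are masked with Inf, the rest are ranked ascending and the corresponding IDs are returned:
getThresholdOrderRwmd(c(0.5, 0, 0.2, 0.9), c("a", "b", "c", "d"), numHead = 2)
## [1] "c" "a"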
#Function to read xml nodes in description
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
xml_attr(xml_find_all(nodes, paste0(".//", elementType)), attribute)
}
#Function to add results to datatable
elementsToDataTable <- function(result, elementType){
if(length(result) > 0)
data.table(elementType = elementType, attributeValue = result)
else
data.table()
}
#Function to retrieve urls from text
keepHtmlElements <- function(feedItem){
nodes <- read_xml(paste0("<div>", feedItem, "</div>"), as_html = TRUE)
rbind(
elementsToDataTable(maintainElements(nodes, "a", "href"), "link"),
elementsToDataTable(maintainElements(nodes, "img", "src"), "image"),
elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")
#All "img-src" are NA
)
}
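Applied to a hand-made snippet, the helper returns one row per extracted attribute:
keepHtmlElements('<a href="https://example.com">site</a> <img src="pic.png"/>')
##    elementType      attributeValue
## 1:        link https://example.com
## 2:       image             pic.png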
#retrieve list of parameters in a http request query
getQueryParams <- function(url){
query <- httr::parse_url(url)$query
queryValues <- unlist(query)
queryNames <- names(query)
dat <- data.table(varName = queryNames, value = queryValues)
dat[queryValues != ""]
}
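For example, with a hypothetical URL, parameters with empty values are dropped:
getQueryParams("https://example.com/search?q=skills&lang=en&empty=")
##    varName  value
## 1:       q skills
## 2:    lang     en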
keepCountryName <- function(string){
string <- gsub(".*_", "", string)
gsub("\\..*", "", string)
}
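This helper derives the locale from the input file names used below, e.g.:
keepCountryName("skills_en.csv")
## [1] "en"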
keepNTokens <- function(string, num){
tokenList <- strsplit(string, split = " ")
sapply(tokenList, function(tokens){
tokens <- sort(tokens)
tokensShift <- shift(tokens, -num, fill = FALSE)
paste(tokens[tokens != tokensShift], collapse = " ")
})
}
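keepNTokens keeps at most num occurrences of each token (after sorting), e.g.:
keepNTokens("data data data science", 1)
## [1] "data science"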
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {
tokensList <- strsplit(corpus[, text], " ")
names(tokensList) <- corpus[, code]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count), by = "class"]
if(normalize == "log")tfDT[, tf := log(1 +term_count)]
merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf)]
}
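A toy corpus (made up here; the real input is a data.table with columns code and text) illustrates the output shape: the term shared by both documents receives a lower tf-idf than the document-specific ones.
corpus <- data.table(code = c("d1", "d2"), text = c("manage data projects", "analyse data"))
findTFIDF(corpus, stopwords = character(0))
##        term class     tfIdf
## 1:  analyse    d2 1.0000000
## 2:     data    d1 0.5945349
## 3:     data    d2 0.5945349
## 4:   manage    d1 1.0000000
## 5: projects    d1 1.0000000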
###########################################################################################################
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] xml2_1.3.2 stringr_1.4.0 DT_0.18 dplyr_1.0.6
## [5] rmarkdown_2.8 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] knitr_1.33 magrittr_2.0.1 tidyselect_1.1.1 R6_2.5.0
## [5] rlang_0.4.11 fansi_0.4.2 highr_0.9 tools_4.0.5
## [9] xfun_0.23 utf8_1.2.1 jquerylib_0.1.4 htmltools_0.5.1.1
## [13] ellipsis_0.3.2 yaml_2.2.1 digest_0.6.27 tibble_3.1.1
## [17] lifecycle_1.0.0 crayon_1.4.1 purrr_0.3.4 htmlwidgets_1.5.3
## [21] sass_0.4.0 vctrs_0.3.8 glue_1.4.2 evaluate_0.14
## [25] stringi_1.6.2 compiler_4.0.5 bslib_0.2.5 pillar_1.6.0
## [29] generics_0.1.0 jsonlite_1.7.2 pkgconfig_2.0.3
../000.core/00.02.base.functions.R completed in 0.1 seconds
Retrieving all locales as provided by ESCO and loading the corresponding .csv files.
inputWorkingDir <- paste0(repository, "input/skills/")
locales <- list.files(inputWorkingDir) %>%
keepCountryName()
Languages: ar, bg, cs, da, de, el, en, es, et, fi, fr, ga, hr, hu, is, it, lt, lv, mt, nl, no, pl, pt, ro, sk, sl, sv.
skillsListDt <- lapply(locales, function(x){
fread(paste0(inputWorkingDir, "skills_", x, ".csv"), na.strings = c("", "NULL"))
})
names(skillsListDt) <- locales
## conceptType conceptUri skillType reuseLevel
## Length:13485 Length:13485 Length:13485 Length:13485
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## preferredLabel altLabels hiddenLabels status
## Length:13485 Length:13485 Length:13485 Length:13485
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## modifiedDate scopeNote definition
## Min. :2016-05-02 16:42:36 Length:13485 Length:13485
## 1st Qu.:2016-12-20 18:16:02 Class :character Class :character
## Median :2016-12-20 19:35:15 Mode :character Mode :character
## Mean :2017-01-24 08:29:02
## 3rd Qu.:2016-12-20 20:51:29
## Max. :2020-06-17 16:07:31
## inScheme description
## Length:13485 Length:13485
## Class :character Class :character
## Mode :character Mode :character
##
##
##
hasData <- lapply(skillsListDt, function(data){
sapply(data, function(x)sum(!is.na(x))/length(x)) %>%
data.table %>%
transpose
}) %>% rbindlist(idcol = TRUE)
setnames(hasData, names(hasData)[-1], names(skillsListDt[[1]]))
## .id conceptType conceptUri skillType reuseLevel
## Length:27 Min. :1 Min. :1 Min. :1 Min. :1
## Class :character 1st Qu.:1 1st Qu.:1 1st Qu.:1 1st Qu.:1
## Mode :character Median :1 Median :1 Median :1 Median :1
## Mean :1 Mean :1 Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1 Max. :1 Max. :1
## preferredLabel altLabels hiddenLabels status
## Min. :0.9999 Min. :0.000964 Min. :0.002373 Min. :1
## 1st Qu.:1.0000 1st Qu.:0.051353 1st Qu.:0.002373 1st Qu.:1
## Median :1.0000 Median :0.105006 Median :0.002373 Median :1
## Mean :1.0000 Mean :0.165495 Mean :0.002398 Mean :1
## 3rd Qu.:1.0000 3rd Qu.:0.169188 3rd Qu.:0.002373 3rd Qu.:1
## Max. :1.0000 Max. :0.962773 Max. :0.002595 Max. :1
## modifiedDate scopeNote definition inScheme
## Min. :1 Min. :0.0000000 Min. :0.000e+00 Min. :1
## 1st Qu.:1 1st Qu.:0.0000000 1st Qu.:0.000e+00 1st Qu.:1
## Median :1 Median :0.0000000 Median :0.000e+00 Median :1
## Mean :1 Mean :0.0006592 Mean :8.240e-06 Mean :1
## 3rd Qu.:1 3rd Qu.:0.0000000 3rd Qu.:0.000e+00 3rd Qu.:1
## Max. :1 Max. :0.0177976 Max. :2.225e-04 Max. :1
## description
## Min. :0.9999
## 1st Qu.:1.0000
## Median :1.0000
## Mean :1.0000
## 3rd Qu.:1.0000
## Max. :1.0000
1.load.data.R completed in 3.9 seconds
Removing columns with low information and cleansing the text for each language.
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores, type = "FORK")
skillsListDtClean <- parLapply(cl, locales, function(x) {
columnsToKeep <- c("conceptUri", "skillType", "preferredLabel", "altLabels", "description")
dt <- skillsListDt[[x]][, columnsToKeep, with = FALSE]
dt[, displayTitle := preferredLabel]
#cleansing text
dt[, skillType := gsub(".*/", "", skillType)]
dt[, conceptUri := gsub(".*/", "", conceptUri)]
for (j in columnsToKeep[-1])
set(dt, j = j, value = cleansingCorpus(dt[[j]]))
dt
})
stopCluster(cl)
names(skillsListDtClean) <- locales
## NULL
2.process.data.R completed in 29.11 seconds
Datasource : /data/generic/jobsOutput/skills/skillsListDtClean.rds of 39,646,671 bytes.
repo <- "jobsOutput/skills/escoSkillsClean/"
no_answer <-
lapply(locales, function(x) {
name <- paste0(repo, "processed_skills_", x, ".rds")
saveBinary(skillsListDtClean[[x]], filename = name,
baseFolder = repository)
})
Datasource : /data/generic/jobsOutput/skills/escoSkillsClean/.
3.save.data.R completed in 22.44 seconds
Completed in 56.03 seconds.