Load, process and tidy ESCO classification data

EPAS database

Report generated on

2021-07-07

Abstract

This process performs all the pre-processing and wrangling of the necessary data-sets to run the skills identification algorithm. The frequency table and the multilingual mappings are used, as they are encoded in the ESCO classification. The data are transformed and weighted. The more common a word is, the less weight it gets, reflecting the fact that commonly used words encode less information. The weight is also analogous to the frequency of an ESCO code measured in the given CVs.

# Report format reference: http://rmarkdown.rstudio.com/html_document_format.htm
# Accumulator for per-script run times; starts with a single 0 entry.
sourceTimeNeeded <- 0
# Elapsed-time stamp taken right before the next script runs.
source.starting.time <- proc.time()[3]

Information about the libraries, environment, sources used and their execution is reported. Additional information is provided within section tabs. Navigating through the report is also possible through the table of contents. Reported tables can be dynamically filtered, searched, ordered and exported into various formats.

Environment

R version

R.Version()$version.string

## [1] "R version 4.0.5 (2021-03-31)"

Libraries initialisation

# Record the installed version of every package named in `libraries`
# (a character vector defined outside this chunk).
# Iterating over the vector itself replaces the 1:length(libraries) loop,
# which would visit c(1, 0) for an empty vector, and avoids growing the
# result element by element.
librariesVersion <- vapply(
    libraries,
    function(pkg) as.character(packageVersion(pkg)),
    character(1),
    USE.NAMES = FALSE
)

# require() returns TRUE/FALSE per package instead of stopping, so
# librariesLoaded holds one logical flag per entry of `libraries`.
librariesLoaded <- lapply(libraries, require, character.only = TRUE)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Loading required package: DT

## Loading required package: text2vec

## Loading required package: stopwords

## Loading required package: parallel

timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.01.libraries.R completed in 2 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]


## Base functions

# ESCO skills
# @authors ds@eworx.gr
repository <- "/data/generic/"

# Build the full path of a file by prefixing it with a base folder.
#   filename:   path relative to baseFolder
#   baseFolder: prefix; it is concatenated as-is, so it must already end
#               with a path separator (defaults to `repository`)
# Returns the concatenated path(s) as a character vector.
getSourcePath <- function(filename, baseFolder = repository){
    paste0(baseFolder, filename)
}

# Read a delimited text file located under baseFolder with data.table::fread.
#   filename:         path relative to baseFolder
#   colClasses:       optional column classes; when empty, fread guesses them
#   baseFolder:       folder prefix passed to getSourcePath
#   header, sep, encoding, na.strings, verbose: forwarded to fread
#   stringsAsFactors: forwarded to fread only when colClasses is empty
# Returns whatever fread returns for the file.
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
    path <- getSourcePath(filename, baseFolder)
    if(length(colClasses) > 0){
        # NOTE(review): stringsAsFactors is not passed on this path, so fread's
        # own default applies when explicit column classes are given - confirm
        # this is intended.
        data.table::fread(
            input = path, colClasses = colClasses, header = header, sep = sep,
            encoding = encoding, verbose = verbose, showProgress = TRUE,
            na.strings = na.strings
        )
    }else{
        data.table::fread(
            input = path, header = header, sep = sep, encoding = encoding,
            stringsAsFactors = stringsAsFactors, verbose = verbose,
            showProgress = TRUE, na.strings = na.strings
        )
    }
}

#rds for small disk space & fst for fast load
# Persist an object under baseFolder, creating the target folder when needed.
#   data:       object to store
#   filename:   path relative to baseFolder (required; the previous default
#               `filename = filename` referred to itself and could never be
#               evaluated)
#   baseFolder: folder prefix passed to getSourcePath
#   format:     "rds" (saveRDS) or "fst" (fst::write_fst)
# Returns, invisibly, the value of the underlying writer.
# Stops on any other format instead of silently writing nothing.
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
    if(!(format %in% c("rds", "fst"))){
        stop("Unsupported format '", format, "': expected \"rds\" or \"fst\".", call. = FALSE)
    }
    fileName <- getSourcePath(filename, baseFolder)
    dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
    written <- switch(format,
        rds = saveRDS(data, fileName),
        fst = fst::write_fst(data, fileName)
    )
    invisible(written)
}

#alternative for rough read write operations
# saveRDS wrapper that first creates the destination folder (recursively).
#   object: object to serialise
#   file:   full destination path
saveRDS_ <- function(object, file){
  targetDir <- dirname(file)
  dir.create(targetDir, recursive = TRUE, showWarnings = FALSE)
  saveRDS(object, file)
}

#rds for small disk space & fst for fast load
# Load an object previously stored under baseFolder.
#   filename:      path relative to baseFolder
#   baseFolder:    folder prefix passed to getSourcePath
#   format:        "rds" (readRDS) or "fst" (fst::read_fst)
#   as.data.table: forwarded to fst::read_fst; unused for "rds"
# Returns the loaded object; for any other format the result is NULL.
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
  if(format == "rds"){
    readRDS(getSourcePath(filename, baseFolder))
  }else if(format == "fst"){
    fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table)
  }else{
    invisible(NULL)
  }
}

# Describe the dimensions of a table as text, e.g. "1,234 Rows X  5 Columns".
# The row count is formatted with a thousands separator.
rowColumns <- function(data){
    rowsLabel <- format(nrow(data), big.mark = ",")
    paste(rowsLabel, "Rows X ", ncol(data), "Columns")
}

# Make sure the include.css asset exists in the results folder.
# The results storage is mounted in memory, so the asset has to be copied
# there again whenever it is missing.
# Returns TRUE when the file is already present, otherwise the result of
# file.copy().
publishIncludeCss <- function(){
    sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
    destinationFile <- "/data/tmpfs/results/include.css"
    if (file.exists(destinationFile)) {
        return(TRUE)
    }
    file.copy(sourceFile, destinationFile)
}

# Turn the output of summary(data) into a plain data.frame.
# unclass() drops the "table" class; check.names = FALSE keeps the column
# labels produced by summary() untouched and strings stay character.
summariseTable <- function(data){
    data.frame(
        unclass(summary(data)),
        check.names = FALSE,
        stringsAsFactors = FALSE
    )
}

# Convert every character column of a table into a factor.
#   data: data.frame-like object whose columns are accessible with [[ ]]
# Returns the table with character columns replaced by factors; all other
# columns are left as they are.
factoriseCharacterColumns <- function(data){
    for(name in names(data)){
        # is.character() yields a single TRUE/FALSE, whereas comparing class()
        # with == gives one value per class and breaks the `if` for columns
        # carrying several classes (e.g. POSIXct).
        if(is.character(data[[name]])){
            data[[name]] <- as.factor(data[[name]])
        }
    }
    data
}

############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown

# Upper-case the first character of each string and keep the rest unchanged.
capitalise <- function(x){
    firstLetter <- toupper(substring(x, 1, 1))
    remainder <- substring(x, 2, nchar(x))
    paste0(firstLetter, remainder)
}


# Add a coloured bar background to one column of a DT widget.
#   data:                 table whose column values scale the bars
#   result:               DT widget to decorate
#   columnName:           column to style
#   color:                bar colour
#   columnsName_original: column names present in `data`
# Returns `result` unchanged when the column is not among
# columnsName_original, otherwise the styled widget.
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
    if(!(columnName %in% columnsName_original)){
        return(result)
    }

    barBackground <- styleColorBar(data[[columnName]], color)
    formatStyle(
        result,
        columnName,
        background = barBackground,
        backgroundSize = '100% 90%',
        backgroundRepeat = 'no-repeat',
        backgroundPosition = 'center'
    )
}


# Render a table as an interactive DT widget (column filters, export buttons,
# search highlighting) with bar styling on selected numeric columns.
#   data:      table to display
#   anonymize: when TRUE (the default) nothing is rendered and NULL is returned
# Returns NULL or the DT widget.
reportTabularData <- function(data, anonymize=TRUE){

    if(anonymize){
        return(NULL)
    }

    columnsName_original <- names(data)
    # Displayed headers are the original names with a capitalised first letter.
    columnsName <- lapply(columnsName_original, capitalise)

    headerBorderJs <- JS(
        "function(settings, json) {",
            "$(this.api().table().header()).css({'border': '1px solid'});",
        "}"
    )

    tableOptions <- list(
        pageLength = 20,
        columnDefs = list(list(className = 'dt-left', targets = "_all")),
        dom = 'Bfrtip',
        buttons = c('copy', 'csv', 'excel', 'pdf'),
        searchHighlight = TRUE,
        initComplete = headerBorderJs
    )

    result <- DT::datatable(
        data,
        class = 'cell-border stripe',
        filter = 'top',
        rownames = FALSE,
        colnames = columnsName,
        extensions = 'Buttons',
        options = tableOptions
    )

    # Columns that get a bar background (when present) and their colours.
    barColours <- c(
        Count = 'steelblue',
        sourceTimeNeeded = '#808080',
        timeNeeded = '#808080',
        percentMatch = '#4682b4'
    )
    for(columnName in names(barColours)){
        result <- styliseDTNumericalColumn(
            data, result, columnName, barColours[[columnName]], columnsName_original
        )
    }

    result
}

# Font substitution table: every listed family is mapped to "DejaVu Serif".
fonts <- list(
  "sans" = "DejaVu Serif",
  "mono" = "DejaVu Serif",
  "Times New Roman" = "DejaVu Serif"
)

#read_xml_to_list <- function(filepath, is.gz = FALSE){
#   if(is.gz){  
#       temp_data <- paste0(repository, "data/delete.me")
#       result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
#       Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
#       unlink(temp_data)
#       result
#   }else{
#       xmlToList(xmlParse(filepath))
#   }
#}

#transpose_list_to_dt <- function(data_list){
#  dt <- t(as.data.table(data_list))
#  dt <- as.data.table(dt)
#  dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
#  dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
#  names(dt) <- names(data_list[[1]])
#  dt
#}

# Clean raw (HTML) text through a sequence of optional regex steps.
#   htmlString:            character vector to clean
#   rem.html:              replace HTML tags with a space
#   rem.http:              replace http/https/ftp links with a space
#   rem.newline:           replace \r, \n and \t with a space
#   rem.nonalphanum:       replace every non-alphabetic character with a space
#   rem.longwords:         replace runs of 35 or more word characters
#   rem.space:             collapse consecutive whitespace into one space
#   tolower:               lower-case the text
#   add.space.to.numbers:  separate digits from adjacent letters
#   rem.country.begin:     strip a leading "EU"
#   rem.nonalphanum.begin: strip leading "?", en-dash and "-" characters
#   rem.space.begin:       strip leading whitespace (before and after the two
#                          steps above)
# Returns the cleaned text with surrounding whitespace trimmed.
cleansingCorpus <- function(
    htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
    rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE, 
    tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
    rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE
){
  # Start from the input so every step is optional; previously `text` only
  # existed when rem.html was TRUE and any later step failed otherwise.
  text <- htmlString
  if(rem.html){text <- gsub("<.*?>", " ", text)} # removing html tags
  if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} # removing http destinations
  if(rem.newline){text <- gsub("[\r\n\t]", " ", text)}
  # [^[:alpha:]] also matches digits, so with this step enabled no digits are
  # left for add.space.to.numbers to separate.
  if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} # removing non-alphabetic characters
  if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} # removing words of 35 or more characters
  if(rem.space){text <- gsub("\\s+", " ", text)} # removing excess space
  if(tolower){text <- tolower(text)}
  if(add.space.to.numbers){ # add space between numbers and letters
    text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
    text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
  }
  if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
  if(rem.country.begin){text <- gsub("^EU", "", text)} # remove the "EU" code from the beginning of the text
  if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} # remove special characters found at the beginning of the text
  if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
  trimws(text)
}
#This function removes dates that are "relics" from the xml parsing
# Replace date fragments with a single space. Two shapes are handled:
#   "<Weekday>, <Month> <day>, <yyyy>"   e.g. "Monday, January 5, 2021"
#   "?<yyyy>"                            a question mark followed by 4 digits
removeDates <- function(text){
  weekdayPattern <- "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
  monthPattern <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
  dayYearPattern <- "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]"
  # The three parts are joined with single spaces.
  longDatePattern <- paste(weekdayPattern, monthPattern, dayYearPattern)
  questionYearPattern <- "\\?[0-9][0-9][0-9][0-9]"

  withoutLongDates <- gsub(longDatePattern, " ", text)
  gsub(questionYearPattern, " ", withoutLongDates)
}

# Extract the text of selected child elements of every <item> node into a
# data.table with one column per element name.
#   xmlData:   xml document/node searched with xml_find_all
#   itemNames: element names looked up under ".//item/"
# Columns are named after `itemNames`; the previous code used `xmlItems`,
# a name that is not defined in this function.
xmlToDataTable <- function(xmlData, itemNames){
  itemList <- lapply(itemNames,
    function(x){
      xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
    }
  )
  names(itemList) <- itemNames
  as.data.table(itemList)
}

# Convert HTML snippets to plain text, moving the content of <li> elements
# to the end so that each description starts with its main content.
#   text: character vector of HTML snippets
# Returns the unlisted results; empty strings are returned as "".
cleanCorpusHtml <- function(text){
  relocateListItems <- function(x){
    if(!(nchar(x) > 0)){
      return("")
    }
    # add a space after every tag so neighbouring texts aren't concatenated
    spacedHtml <- gsub(">","> ", x)
    doc <- read_xml(spacedHtml, as_html = TRUE)
    listItems <- xml_find_all(doc, ".//li")
    xml_remove(listItems)
    # main content first, then the text of the detached <li> nodes
    combined <- paste( paste(xml_text(doc), collapse ="") , paste(xml_text(listItems) , collapse =""), collapse ="")
    gsub("\\s+"," ",  combined)
  }
  unlist(lapply(text, relocateListItems))
}

#Split equally a vector into chunks of number n_chunks
# Split a vector into n_chunks consecutive chunks.
# The first n_chunks - 1 chunks hold length(vct) %/% n_chunks elements each;
# the last chunk takes all remaining elements.
#   vct:      vector to split
#   n_chunks: number of chunks, a single value >= 1
# Returns a list of length n_chunks. Chunks are empty when there are fewer
# elements than chunks (everything then lands in the last chunk).
equal_split <- function(vct, n_chunks) {
  stopifnot(length(n_chunks) == 1L, !is.na(n_chunks), n_chunks >= 1)
  lim <- length(vct)
  fstep <- lim %/% n_chunks
  idx_list <- vector("list", n_chunks)
  # seq_len() yields no iterations for n_chunks == 1, unlike seq(0), which
  # counts down 1, 0; offset + seq_len(fstep) is empty when fstep is 0,
  # unlike a 1:0 range.
  for(i in seq_len(n_chunks - 1)){
    idx_list[i] <- list(vct[(i - 1) * fstep + seq_len(fstep)])
  }
  lastStart <- (n_chunks - 1) * fstep
  idx_list[n_chunks] <- list(vct[lastStart + seq_len(lim - lastStart)])
  idx_list
}

#Function that takes a vector, and returns thresholded first 10 sorted indexes
# Return the ids of the smallest values that lie above a threshold.
#   vct:       numeric vector of values
#   idVec:     ids aligned with vct
#   threshold: values not greater than this are pushed to the end (set to Inf)
#   numHead:   maximum number of ids returned
# Returns the elements of idVec in ascending order of the thresholded values.
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
    notAboveThreshold <- !(vct > threshold)
    thresholded <- replace(vct, notAboveThreshold, Inf)
    ranking <- order(thresholded)
    idVec[head(ranking, numHead)]
}


#Function to read xml nodes in description
# Collect one attribute from every element of a given type found under nodes.
#   nodes:       xml node(s) searched with xml_find_all
#   elementType: element name used in the ".//<elementType>" XPath
#   attribute:   attribute read from each match
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
    xpath <- paste0(".//", elementType)
    matches <- xml_find_all(nodes, xpath)
    xml_attr(matches, attribute)
}

#Function to add results to datatable
# Wrap attribute values in a data.table labelled with their element type.
#   result:      vector of attribute values
#   elementType: label stored in the elementType column
# Returns an empty data.table when there are no values.
elementsToDataTable <- function(result, elementType){
    if(length(result) == 0){
        return(data.table())
    }
    data.table(elementType = elementType, attributeValue = result)
}

#Function to retrieve urls from text
# Collect link and image addresses from an HTML fragment.
# The fragment is wrapped in a <div> and parsed as HTML; the href of <a>
# elements is labelled "link", the src of <img> and <img-src> elements is
# labelled "image".
keepHtmlElements <- function(feedItem){
    wrapped <- paste0("<div>",  feedItem, "</div>")
    nodes <- read_xml(wrapped, as_html = TRUE)

    links <- elementsToDataTable(maintainElements(nodes, "a", "href"), "link")
    images <- elementsToDataTable(maintainElements(nodes, "img", "src"), "image")
    # Per the original note, all "img-src" values are NA.
    imageSources <- elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")

    rbind(links, images, imageSources)
}

#retrieve list of parameters in a http request query
# List the query parameters of a URL as a data.table (varName, value),
# dropping parameters whose value is an empty string.
getQueryParams <- function(url){
  parsedQuery <- httr::parse_url(url)$query
  paramValues <- unlist(parsedQuery)
  params <- data.table(varName = names(parsedQuery), value = paramValues)
  params[paramValues != ""]
}

# Extract the piece of a name that sits between its last underscore and the
# first following dot, e.g. "skills_en.rds" -> "en".
keepCountryName <- function(string){
  afterLastUnderscore <- gsub(".*_", "", string)
  beforeFirstDot <- gsub("\\..*", "", afterLastUnderscore)
  beforeFirstDot
}

# Limit repeated tokens within each string.
# Every string is split on single spaces and its tokens are sorted; a token is
# dropped when it equals the token `num` positions further on in the sorted
# order, then the survivors are pasted back together with spaces.
# Note that the output tokens are in sorted order, not in their original order.
#   string: character vector
#   num:    shift distance used for the comparison
keepNTokens <- function(string, num){
    dropRepeats <- function(tokens){
        sortedTokens <- sort(tokens)
        tokensAhead <- shift(sortedTokens, -num, fill = FALSE)
        kept <- sortedTokens[sortedTokens != tokensAhead]
        paste(kept, collapse = " ")
    }
    sapply(strsplit(string, split = " "), dropRepeats)
}

# Compute TF-IDF weights for the terms of a corpus.
#   corpus:    data.table with columns `text` (space-separated tokens) and
#              `code` (document identifier)
#   stopwords: terms to exclude (NULL keeps every term)
#   normalize: term-frequency scheme:
#                "double" -> 0.5 + 0.5 * count / max count within the document
#                "log"    -> log(1 + count)
#   min_char:  terms must have more than this many characters
# Returns a data.table with columns term, class (the document code) and tfIdf.
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {

  # Fail early with a clear message; otherwise no `tf` column is created and
  # the final multiplication fails with an unrelated error.
  if (!(normalize %in% c("double", "log"))) {
    stop("normalize must be \"double\" or \"log\", got: ", normalize, call. = FALSE)
  }

  # One row per (document, token) occurrence.
  tokensList <- strsplit(corpus[, text], " ")
  names(tokensList) <- corpus[, code]

  tokensDT <- lapply(tokensList, as.data.table) %>%
    rbindlist(idcol = TRUE) %>%
    setnames(c("class", "term"))

  tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]

  # Smoothed inverse document frequency: log(N / (docFreq + 1)) + 1, where N is
  # the number of documents left after filtering and docFreq the number of
  # documents containing the term.
  idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
  idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]

  tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]

  if (normalize == "double") tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count), by = "class"]
  if (normalize == "log") tfDT[, tf := log(1 + term_count)]

  # merge() joins on `by`; `on` is the argument of the DT[i, on = ] join
  # syntax and is not a merge() parameter.
  merge(tfDT, idfDT, by = "term")[, tfIdf := tf * idf][, .(term, class, tfIdf)]

}

###########################################################################################################

Libraries version

(data.table(library = libraries, version = librariesVersion))

includeCssPublished <- publishIncludeCss()

Session info

sessionInfo()

## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] stopwords_2.2     text2vec_0.6      DT_0.18           dplyr_1.0.6      
## [5] rmarkdown_2.8     data.table_1.14.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6           pillar_1.6.0         bslib_0.2.5         
##  [4] compiler_4.0.5       jquerylib_0.1.4      highr_0.9           
##  [7] tools_4.0.5          digest_0.6.27        jsonlite_1.7.2      
## [10] evaluate_0.14        lifecycle_1.0.0      tibble_3.1.1        
## [13] lattice_0.20-41      pkgconfig_2.0.3      rlang_0.4.11        
## [16] Matrix_1.3-2         mlapi_0.1.0          RhpcBLASctl_0.20-137
## [19] yaml_2.2.1           xfun_0.23            stringr_1.4.0       
## [22] knitr_1.33           generics_0.1.0       vctrs_0.3.8         
## [25] sass_0.4.0           htmlwidgets_1.5.3    grid_4.0.5          
## [28] tidyselect_1.1.1     glue_1.4.2           R6_2.5.0            
## [31] fansi_0.4.2          lgr_0.4.2            purrr_0.3.4         
## [34] magrittr_2.0.1       ellipsis_0.3.2       htmltools_0.5.1.1   
## [37] float_0.2-4          rsparse_0.4.0        utf8_1.2.1          
## [40] stringi_1.6.2        crayon_1.4.1

timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.02.base.functions.R completed in 0.09 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Loading data.

ESCO classification

Loading multilingual ESCO classification list.

fileName <- "jobsOutput/skills/skillsListDtClean.rds"
allESCOskillsListClean <- loadBinary(fileName)

Type of data: list.
Number of elements: 27

fileName <- "jobsOutput/skills/cleansedEPASskills.rds"
cleansedEPASSkills <- loadBinary(fileName)

Type of data: data.table, data.frame.
Number of elements: 1761673, 4

timeNeeded <- (proc.time()[3] -  source.starting.time);

01.00.load.data.R completed in 13.99 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Tidying ESCO skills

Process data

The process uses the ESCO classification and the alternative labels to calculate multilingual lookup tables as well as weighted mappings between tokens and ESCO codes.

# Leave one core for the main session, but never request fewer than one
# worker (detectCores() can return 1 or NA).
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(no_cores, type = "FORK")
# For every table in the list: use the concept URI as code, replace newlines
# in the labels with spaces, limit repeated tokens in the alternative labels
# and join both labels into one `text` column.
# `finally` stops the workers even when a worker call fails.
allESCOskillsTidyList <- tryCatch(
    parLapply(cl, allESCOskillsListClean, function(data){
        data[, code := conceptUri ]
        data[, preferredLabel := trimws(gsub("\\n", " ", preferredLabel))]
        data[, altLabels := trimws(gsub("\\n", " ", altLabels)) %>% keepNTokens(2)]
        data[is.na(altLabels), altLabels := ""]
        data[, text := paste(preferredLabel, altLabels)]
        data[, .(text, code)]
    }),
    finally = stopCluster(cl)
)

Tokens to Codes mapping

Tokenizing descriptions, removing stopwords and mapping each word to a token.

# Languages for which a stopword list is available and which also appear as
# names of the tidy ESCO list.
stopwordsLang <- c(stopwords_getlanguages("snowball"), stopwords_getlanguages("misc"))
escoCountries <- names(allESCOskillsTidyList)
validStopwords <- escoCountries[escoCountries %in% stopwordsLang]

# Leave one core for the main session, but never request fewer than one
# worker (detectCores() can return 1 or NA).
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(no_cores, type = "FORK")
# TF-IDF weights per language; `finally` stops the workers even when a worker
# call fails.
weightedTokensList <- tryCatch(
    parLapply(cl, seq_along(allESCOskillsTidyList), function(i){
        escoSkills <- allESCOskillsTidyList[[i]]
        lang <- names(allESCOskillsTidyList)[i]
        # NOTE(review): stopwords(lang) is called without a `source`, while
        # validStopwords also contains languages listed under "misc" - confirm
        # those resolve as intended.
        stopWords <- NULL
        if(lang %in% validStopwords){
            stopWords <- stopwords(lang)
        }
        # Runs for every language; only the stopword lookup is conditional.
        findTFIDF(escoSkills, stopWords) %>%
            setNames( c("word", "code", "word_weight"))
    }),
    finally = stopCluster(cl)
)
names(weightedTokensList) <- names(allESCOskillsTidyList)

Sorted vocabulary

Sorting the vocabulary with respect to word_weight to reduce computational complexity of the string distance mapping.

# Per language: keep the first row of every word, then list the words by
# decreasing word_weight.
# NOTE(review): duplicates are removed before sorting, so each word is ranked
# by the weight of its first row rather than its highest weight - confirm.
sortedVocabularyList <- lapply(weightedTokensList, function(tokens){
    firstRows <- tokens[!duplicated(tokens$word)]
    firstRows[order(-word_weight), word]
})
names(sortedVocabularyList) <- names(allESCOskillsTidyList)

timeNeeded <- (proc.time()[3] -  source.starting.time);

02.00.process.data.R completed in 72.71 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Tidying frequency table

# Work on a copy: plain assignment of a data.table only creates a second name
# for the same object, so by-reference operations on processedSkills (such as
# setnames or :=) would also alter cleansedEPASSkills.
processedSkills <- copy(cleansedEPASSkills)

Change the name of ‘value’ column to ‘label’.

setnames(processedSkills,"value", "label")

Original free text dimensions 1761673, 4

Keep only non-na free-text.

processedSkills <- processedSkills[!is.na(label)]

Free text dimensions after removing NA values 1761673, 4

Keep only non-duplicated free-text.

processedSkills <- processedSkills[!duplicated(processedSkills)]

Free text dimensions after removing duplicate values 1761673, 4

Add a row index column to the free-text dataset, to be used as its key.

# Row number stored as text; seq_len(.N) stays empty for an empty table,
# whereas 1:nrow() would produce c(1, 0).
processedSkills[, index := as.character(seq_len(.N))]

Collect linguistic free-text to be matched.

# Rows whose `variable` contains "Linguistic", reduced to the matching columns.
linguisticText <- processedSkills[
    grepl("Linguistic", variable),
    .(id, locale, label, index)
]

Linguistic dimensions: 946312, 4

Collect other free-text to be matched.

# Rows with a label whose `variable` does not contain "Linguistic", reduced to
# the matching columns.
freeText <- processedSkills[
    !is.na(label) & !grepl("Linguistic", variable),
    .(id, locale, label, index)
]

Free-text dimensions: 815361, 4

timeNeeded <- (proc.time()[3] -  source.starting.time);

03.00.process.data.R completed in 8.01 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Persist data

outputRepo <- getSourcePath("jobsOutput/skills/skillsForMatching/")
filename <- "weightedTokensListEscoEpas.rds"
saveBinary(weightedTokensList, filename, outputRepo)

Datasource : /data/generic/jobsOutput/skills/skillsForMatching/weightedTokensListEscoEpas.rds of 32,924,468 bytes.

filename <- "sortedVocabularyListEscoEpas.rds"
saveBinary(sortedVocabularyList, filename, outputRepo)

Datasource : /data/generic/jobsOutput/skills/skillsForMatching/sortedVocabularyListEscoEpas.rds of 1,764,050 bytes.

filename <- "linguisticText.rds"
saveBinary(linguisticText, filename, outputRepo)

Datasource : /data/generic/jobsOutput/skills/skillsForMatching/linguisticText.rds of 14,575,624 bytes.

filename <- "freeText.rds"
saveBinary(freeText, filename, outputRepo)

Datasource : /data/generic/jobsOutput/skills/skillsForMatching/freeText.rds of 68,749,577 bytes.

filename <- "processedSkills.fst"
saveBinary(processedSkills, filename, outputRepo, "fst")

Datasource : /data/generic/jobsOutput/skills/skillsForMatching/processedSkills.fst of 291,034,533 bytes.

timeNeeded <- (proc.time()[3] -  source.starting.time);

04.00.save.data.R completed in 29.33 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)

Computation metrics

source.blocks$sourceTimeNeeded <- sourceTimeNeeded;

Computational report

Completed in 126.13 seconds.

Subpart metrics

reportTabularData(source.blocks);

## NULL