Free-Text Skills Matching (Weighted)

EPAS database

Report generated on

2021-07-07

Abstract

The process of skills identification has two major phases. The matching method is used to identify a free text word in the ESCO vocabulary. The process is language agnostic and the languages used are those in ESCO classification for skills. The large amount of data, the multilingual nature of the problem and the high computational complexity demands scalable solutions. A low-level implementation in C is used for matching and all intensive calculations are parallelized.

# http://rmarkdown.rstudio.com/html_document_format.htm
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]

Information about the libraries, environment, sources used and their execution is reported. Aditional information is provided within section tabs. Navigating through the report is also possible through the table of contents. Tables reported, can be dynamically filtered, searched ordered and exported into various formats.

Environment

R version

R.Version()$version.string

## [1] "R version 4.0.5 (2021-03-31)"

Libraries intialisation

librariesVersion <- c()

for(i in 1:length(libraries))
    librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))

librariesLoaded <- lapply(libraries, require, character.only = TRUE)

## Loading required package: magrittr

## Loading required package: DT

## Loading required package: text2vec

## Loading required package: stringdist

## 
## Attaching package: 'stringdist'

## The following object is masked from 'package:magrittr':
## 
##     extract

## Loading required package: parallel

## Loading required package: stopwords

timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.01.libraries.R completed in 1.68 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]


## Base functions

# ESCO skills
# @authors ds@eworx.gr
repository <- "/data/generic/"

getSourcePath <- function(filename, baseFolder = repository){
    return(paste(baseFolder, filename, sep = ""))
}

readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
    if(length(colClasses) == 0)
        return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
    return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE,   na.strings = na.strings )  )
}

#rds for small disk space & fst for fast load
saveBinary <- function(data, filename = filename, baseFolder = repository, format = "rds"){
    fileName <- getSourcePath(filename, baseFolder)
    dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
    if(format == "rds") saveRDS(data, fileName)
    if(format == "fst") fst::write_fst(data, fileName)
}

#alternative for rough read write operations
saveRDS_ <- function(object, file){
  dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
  saveRDS(object, file)
}

#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
  if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
  if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}

rowColumns <- function(data){
    return(paste( format(nrow(data),  big.mark=","), "Rows X ", ncol(data), "Columns"))
}

publishIncludeCss <- function(){
    sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
    destinatinoFile <- "/data/tmpfs/results/include.css"
    if (!file.exists(destinatinoFile)) {
        return (file.copy(sourceFile, destinatinoFile))
    }else{
        return(TRUE);
    }

}
#as the mountstorage is on memory make sure the asset include.css is there.

summariseTable <- function(data){
    return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
    #return(do.call(cbind, lapply(data, summary)))
}

factoriseCharacterColumns <- function(data){
    for(name in names(data)){
        if( class(data[[name]]) =="character"){
            data[[name]] <- as.factor(data[[name]])
        }
    }
    return(data) 
}

############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown

capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))


styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){

    if(columnName%in% columnsName_original){
        result <- result %>%   formatStyle(
            columnName,
            background = styleColorBar(data[[columnName]], color),
            backgroundSize = '100% 90%',
            backgroundRepeat = 'no-repeat',
            backgroundPosition = 'center'
        )
    }
    return(result)

}


reportTabularData <- function(data, anonymize=TRUE){

    if(anonymize)return()

    columnsName <- names(data)
    columnsName <- lapply(columnsName, capitalise)
    columnsName_original <- names(data)

    result <-
        DT::datatable(
            data,
            class = 'cell-border stripe',
            filter = 'top',
            rownames = FALSE,
            colnames = columnsName,
            extensions = 'Buttons',
            options = list(
                pageLength = 20,
                columnDefs = list(list(className = 'dt-left', targets = "_all")),
                dom = 'Bfrtip',
                buttons = c('copy', 'csv', 'excel', 'pdf'),
                searchHighlight = TRUE,
                initComplete = JS(
                    "function(settings, json) {",
                        "$(this.api().table().header()).css({'border': '1px solid'});",
                    "}"
                )
            )

        )


    result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
    result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
    result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
    #result <- styliseDTNumericalColumn(data,result, "percentMatch", '#5fba7d', columnsName_original)
    result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)

 
 

    return(result)
}

fonts <- list(
 sans = "DejaVu Serif",
  mono = "DejaVu Serif",
  `Times New Roman` = "DejaVu Serif"
)

#read_xml_to_list <- function(filepath, is.gz = FALSE){
#   if(is.gz){  
#       temp_data <- paste0(repository, "data/delete.me")
#       result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
#       Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
#       unlink(temp_data)
#       result
#   }else{
#       xmlToList(xmlParse(filepath))
#   }
#}

#transpose_list_to_dt <- function(data_list){
#  dt <- t(as.data.table(data_list))
#  dt <- as.data.table(dt)
#  dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
#  dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
#  names(dt) <- names(data_list[[1]])
#  dt
#}

cleansingCorpus <- function(
    htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
    rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE, 
    tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
    rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE
){
  if(rem.html){text <- gsub("<.*?>", " ", htmlString)} # removing html commands
  if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
  if(rem.newline){text <- gsub("[\r\n\t]", " ", text)} 
  if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphanumeric
  if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} ##Removing words with more than 30 letters
  if(rem.space){text <- gsub("\\s+", " ", text)}  #removing excess space 
  if(tolower){text <- tolower(text)}
  if(add.space.to.numbers){    #add space between number and letters
    text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
    text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
  }
  if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)} 
  if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
  if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters identified in the beginning of text
  if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
  trimws(text)
}
#This function removes dates that are "relics" from the xml parsing
removeDates <- function(text){
  days <-  "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
  months <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
  date_form1 <- paste(days, months, "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]")
  date_form2 <- "\\?[0-9][0-9][0-9][0-9]"
  text <- gsub(date_form1, " ", text)
  gsub(date_form2, " ", text)
}

xmlToDataTable <- function(xmlData, itemNames){
  itemList <- lapply(itemNames,
    function(x){
      xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
    }
  )
  names(itemList) <- xmlItems
  as.data.table(itemList)
}

cleanCorpusHtml <- function(text){
  unlist(lapply(text, function(x){
    if(nchar(x) > 0){
        # because nodes were starting with tag keywords in li, we relocate at the end so the information remains and the description 
        # starts with the main content
        html <- gsub(">","> ", x) # add spaces after html tags so these aren't concatenated 
        xml <- read_xml(html, as_html = TRUE)
        lis <- xml_find_all(xml, ".//li")
        xml_remove(lis)
        text <- paste( paste(xml_text(xml), collapse ="") , paste(xml_text(lis) , collapse =""), collapse ="")
        text <- gsub("\\s+"," ",  text)

    }else {""}
  }))
}

#Split equally a vector into chunks of number n_chunks
equal_split <- function(vct, n_chunks) {
  lim <- length(vct)
  fstep <- lim%/%n_chunks
  idx_list <- list()
  for(i in seq(n_chunks - 1)){
    idx_list[[i]] <- vct[((i-1)*fstep + 1):(i*(fstep))]
  }
  idx_list[[n_chunks]] <- vct[((n_chunks - 1)*fstep + 1):(lim)]
  return(idx_list)
}

#Function that takes a vector, and returns thresholded first 10 sorted indexes
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
    vct <- ifelse(vct > threshold, vct, Inf)
    indexVec <- head(order(vct), numHead)
    idVec[indexVec]
}


#Function to read xml nodes in description
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
    xml_attr(xml_find_all(nodes, paste0(".//", elementType)), attribute)
}

#Function to add results to datatable
elementsToDataTable <- function(result, elementType){
    if(length(result) > 0)
        data.table(elementType = elementType, attributeValue = result)
    else
        data.table()
}

#Function to retrieve urls from text
keepHtmlElements <- function(feedItem){
    nodes <- read_xml(paste0("<div>",  feedItem, "</div>"), as_html = TRUE)
    rbind(
        elementsToDataTable(maintainElements(nodes, "a", "href"), "link"),
        elementsToDataTable(maintainElements(nodes, "img", "src"), "image"),
        elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")
        #All "img-src" are NA
    )
}

#retrieve list of parameters in a http request query
getQueryParams <- function(url){
  query <- httr::parse_url(url)$query
  queryValues <- unlist(query)
  queryNames <- names(query)
  dat <- data.table(varName = queryNames, value = queryValues)
  dat[queryValues != ""]
}

keepCountryName <- function(string){
  string <- gsub(".*_", "", string)
  gsub("\\..*", "", string)
}

keepNTokens <- function(string, num){
    tokenList <- strsplit(string, split = " ")
    sapply(tokenList, function(tokens){
        tokens <- sort(tokens)
        tokensShift <- shift(tokens, -num, fill = FALSE)
        paste(tokens[tokens != tokensShift], collapse = " ")    
    })
}

findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {

  tokensList <- strsplit(corpus[, text], " ")
  names(tokensList) <- corpus[, code]

  tokensDT <- lapply(tokensList, as.data.table) %>% 
    rbindlist(idcol = TRUE) %>%
        setnames(c("class", "term"))

  tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
    #inverse document frequency smooth  
    idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
    idfDT[, idf :=  log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]    

  tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]

  if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count), by = "class"]
  if(normalize == "log")tfDT[, tf := log(1 +term_count)]
  
  merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf)]
    
}

###########################################################################################################

Libraries version

(data.table(library = libraries, version = librariesVersion))

includeCssPublished <- publishIncludeCss()

Session info

sessionInfo()

## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] stopwords_2.2      stringdist_0.9.6.3 text2vec_0.6       DT_0.18           
## [5] magrittr_2.0.1     rmarkdown_2.8      data.table_1.14.0 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6           mlapi_0.1.0          knitr_1.33          
##  [4] RhpcBLASctl_0.20-137 float_0.2-4          lattice_0.20-41     
##  [7] R6_2.5.0             rlang_0.4.11         lgr_0.4.2           
## [10] stringr_1.4.0        highr_0.9            tools_4.0.5         
## [13] grid_4.0.5           xfun_0.23            jquerylib_0.1.4     
## [16] htmltools_0.5.1.1    yaml_2.2.1           digest_0.6.27       
## [19] rsparse_0.4.0        crayon_1.4.1         Matrix_1.3-2        
## [22] vctrs_0.3.8          sass_0.4.0           htmlwidgets_1.5.3   
## [25] evaluate_0.14        stringi_1.6.2        compiler_4.0.5      
## [28] bslib_0.2.5          jsonlite_1.7.2

timeNeeded <- (proc.time()[3] -  source.starting.time);

../000.core/00.02.base.functions.R completed in 0.12 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Load data.

Loading ESCO word tokens.

inputRepo <- getSourcePath("jobsOutput/skills/skillsForMatching/")
filename <- "weightedTokensListEscoEpas.rds"
weightedTokensList <- loadBinary(filename, inputRepo)

Type of data: list.
Dimensions: ****.
Column Names: ar, bg, cs, da, de, el, en, es, et, fi, fr, ga, hr, hu, is, it, lt, lv, mt, nl, no, pl, pt, ro, sk, sl, sv.

Loading multilingual ESCO vocabulary.

filename <- "sortedVocabularyListEscoEpas.rds"
sortedVocabularyList <- loadBinary(filename, inputRepo)

Type of data: list.
Dimensions: ****.
Column Names: ar, bg, cs, da, de, el, en, es, et, fi, fr, ga, hr, hu, is, it, lt, lv, mt, nl, no, pl, pt, ro, sk, sl, sv.

Loading free-text.

filename <- "freeText.rds"
freeText <- loadBinary(filename, inputRepo)

Type of data: data.table, data.frame.
Dimensions: 815361, 4.
Column Names: id, locale, label, index.

Loading multilingual ESCO skills list.

filename <- "../skillsListDtClean.rds"
skillsListDtClean <- loadBinary(filename, inputRepo)

Type of data: list.
Dimensions: ****.
Column Names: ar, bg, cs, da, de, el, en, es, et, fi, fr, ga, hr, hu, is, it, lt, lv, mt, nl, no, pl, pt, ro, sk, sl, sv.

timeNeeded <- (proc.time()[3] -  source.starting.time);

01.00.load.data.R completed in 15.62 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Tokenisation

Tokenising free text entries for each language with multi-threading.

langInFreeText <- freeText$locale %>% unique
langInEsco <- names(weightedTokensList)
languages <- langInFreeText[langInFreeText %in% langInEsco]

freeTextPerLang <- lapply(languages, function(x)freeText[locale == x])
names(freeTextPerLang) <- languages

no_cores <- detectCores() - 1
cl <- makeCluster(no_cores, type = "FORK")
freeTextTokenList <- parLapply(cl, freeTextPerLang, function(x){
    wordTokensUnknown <- word_tokenizer(x$label)
    names(wordTokensUnknown) <- paste0(x$index, "_")
    res <- wordTokensUnknown %>% unlist
    res #[nchar(res) > 2]
})
stopCluster(cl)

Weight threshold

Discarding 8% of lowest weights.

weightedTokensList <- lapply(weightedTokensList, function(x){
    threshold <- quantile(x$word_weight, thr)
    x[word_weight > threshold]
})

timeNeeded <- (proc.time()[3] -  source.starting.time);

02.00.process.data.R completed in 88.99 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

String Distance Identifier

This process maps words retrieved from free-text to vocabulary terms of the EPAS backend. The Optimal String Alignment distance is used that allows more types of edit operations. These computations are expensive so a low level interface to C with multithreading is used.

Preparing vocabulary

Mapping free-text to EPAS vocabulary.

stopwordsLang <- c(stopwords_getlanguages("snowball"), stopwords_getlanguages("misc"))

getMatches <- lapply(seq_along(freeTextTokenList), function(i){
    language <- names(freeTextTokenList)[i]
    voca <- sortedVocabularyList[[language]]
    freeTokens <- freeTextTokenList[[i]]
    if(language %in% stopwordsLang) freeTokens <- freeTokens[!freeTokens %in% stopwords(language)]
    vocaIndexes <- match(freeTokens, voca)
    res <- data.table(index = gsub("_.*", "", names(freeTokens)), word =  voca[vocaIndexes])
    res <- res[!is.na(index)][!is.na(word)]
    res <- res[!duplicated(res)]
})

## Warning: 'stopwords(language = "el")' is deprecated.
## Use 'stopwords(language = "el", source = "misc")' instead.
## See help("Deprecated")

names(getMatches) <- names(freeTextTokenList)

Split the matches in multiple chunks to reduce space complexity.

no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
indexChunks <-parLapply(cl, getMatches, function(matches){
    indexList <- split(unique(matches$index), 1:100)
    lapply(indexList, function(indexes){
        which(matches$index %in% indexes)
    }) 
})
stopCluster(cl)

Use the matches to find the top 20 recommended ESCO skills for each free-text skill entry.

no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
clusterExport(cl, varlist=c("getMatches", "weightedTokensList", "indexChunks", "numReco"))
mergeLangData <- parLapply(cl, languages, function(lang){
    library(data.table)
    getMatchesLang <- getMatches[[lang]]
    weightTokens <- weightedTokensList[[lang]]     
    res <- lapply(indexChunks[[lang]], function(indexes){
        getMatchesLangChunk <- getMatchesLang[indexes] 
        dat <- weightTokens[getMatchesLangChunk, on = "word", allow.cartesian = TRUE]
        dat <- dat[!is.na(code)]
        dat <- dat[!duplicated(dat)]
        dat[ , .(total_weight = sum(word_weight)), by = c("code", "index")][order(index, -total_weight)][, head(.SD, numReco), by = "index"][ , .(index, code, total_weight)]
    }) 
    rbindlist(res)
})
stopCluster(cl)

Join free-text and predicted skills.

freeIdToCode <- mergeLangData %>% rbindlist
freeTextPredictions <- merge(freeText, freeIdToCode, on = "index", all = TRUE)

Add the english skill title to each predicted skill.

skillsESCO <- skillsListDtClean[["en"]]
freeTextPredictions[, skillTitle :=  skillsESCO$displayTitle[match(code, skillsESCO$conceptUri)]]

Percentage of matching: 0.9767428.

timeNeeded <- (proc.time()[3] -  source.starting.time);

03.00.process.data.R completed in 438.94 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Persist data

Saving suggeted codes for free text entries.

outputRepo <- getSourcePath("jobsOutput/skills/skillsForMatching/") 
filename <- "freeTextPredictions.rds"
saveBinary(freeTextPredictions, filename, outputRepo)

Datasource : /data/generic/jobsOutput/skills/skillsForMatching/freeTextPredictions.rds of 503,657,148 bytes.

filename <- "weightedTokensUsed.rds"
saveBinary(weightedTokensList, filename, outputRepo)

Datasource : /data/generic/jobsOutput/skills/skillsForMatching/weightedTokensUsed.rds of 30,152,132 bytes.

timeNeeded <- (proc.time()[3] -  source.starting.time);

04.00.save.data.R completed in 173.25 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)

Computation metrics

source.blocks$sourceTimeNeeded <- sourceTimeNeeded;

Computational report

Completed in 718.6 seconds.

Subpart metrics

reportTabularData(source.blocks);

## NULL