# http://rmarkdown.rstudio.com/html_document_format.htm
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]

Information about the libraries, environment, sources used and their execution is reported. Aditional information is provided within section tabs. Navigating through the report is also possible through the table of contents. Tables reported, can be dynamically filtered, searched ordered and exported into various formats.

Environment

R version

R.Version()$version.string

## [1] "R version 4.0.5 (2021-03-31)"

Libraries intialisation

librariesVersion <- c()

for(i in 1:length(libraries))
    librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))

librariesLoaded <- lapply(libraries, require, character.only = TRUE)

## Loading required package: magrittr

## Loading required package: DT

## Loading required package: text2vec

## Loading required package: stringdist

## 
## Attaching package: 'stringdist'

## The following object is masked from 'package:magrittr':
## 
##     extract

## Loading required package: parallel

## Loading required package: stopwords

timeNeeded <- (proc.time()[3] -  source.starting.time);

../00.core/00.01.libraries.R completed in 1.68 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]


## Base functions

# EPAS-DS
# @authors ds@eworx.gr
repository <- "/data/generic/"

getSourcePath <- function(filename, baseFolder = repository){
  return(paste(baseFolder, filename, sep = ""))
}

readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
  if(length(colClasses) == 0)
    return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
  return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE,   na.strings = na.strings )  )
}

#rds for small disk space & fst for fast load
saveBinary <- function(data, filename = filename, baseFolder = repository, format = "rds"){
  fileName <- getSourcePath(filename, baseFolder)
  dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
  if(format == "rds") saveRDS(data, fileName)
  if(format == "fst") fst::write_fst(data, fileName)
}

#alternative for rough read write operations
saveRDS_ <- function(object, file){
  dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
  saveRDS(object, file)
}

#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
  if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
  if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}

rowColumns <- function(data){
  return(paste( format(nrow(data),  big.mark=","), "Rows X ", ncol(data), "Columns"))
}

publishIncludeCss <- function(){
  sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
  destinatinoFile <- "/data/tmpfs/results/include.css"
  if (!file.exists(destinatinoFile)) {
    return (file.copy(sourceFile, destinatinoFile))
  }else{
    return(TRUE);
  }

}
#as the mountstorage is on memory make sure the asset include.css is there.

summariseTable <- function(data){
  return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
  #return(do.call(cbind, lapply(data, summary)))
}

factoriseCharacterColumns <- function(data){
  for(name in names(data)){
    if( class(data[[name]]) =="character"){
      data[[name]] <- as.factor(data[[name]])
    }
  }
  return(data) 
}

############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown

capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))


styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){

  if(columnName%in% columnsName_original){
    result <- result %>%   formatStyle(
      columnName,
      background = styleColorBar(data[[columnName]], color),
      backgroundSize = '100% 90%',
      backgroundRepeat = 'no-repeat',
      backgroundPosition = 'center'
    )
  }
  return(result)

}


reportTabularData <- function(data, anonymize=TRUE){

  if(anonymize)return()
  
  columnsName <- names(data)
  columnsName <- lapply(columnsName, capitalise)
  columnsName_original <- names(data)

  result <-
    DT::datatable(
      data,
      class = 'cell-border stripe',
      filter = 'top',
      rownames = FALSE,
      colnames = columnsName,
      extensions = 'Buttons',
      options = list(
        pageLength = 20,
        columnDefs = list(list(className = 'dt-left', targets = "_all")),
        dom = 'Bfrtip',
        buttons = c('copy', 'csv', 'excel', 'pdf'),
        searchHighlight = TRUE,
        initComplete = JS(
          "function(settings, json) {",
            "$(this.api().table().header()).css({'border': '1px solid'});",
          "}"
        )
      )

    )


  result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
  result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
  result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
  #result <- styliseDTNumericalColumn(data,result, "percentMatch", '#5fba7d', columnsName_original)
  result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)

  return(result)
}

#data persistance info
reportFileInfo <- function(filename, baseFolder = repository) {
  paste0(
    getSourcePath(filename, baseFolder), " of size ", 
    utils:::format.object_size(file.size(getSourcePath(filename, outputRepo)) + 1000000, "auto"))
}

fonts <- list(
 sans = "DejaVu Serif",
  mono = "DejaVu Serif",
  `Times New Roman` = "DejaVu Serif"
)

#read_xml_to_list <- function(filepath, is.gz = FALSE){
# if(is.gz){  
#     temp_data <- paste0(repository, "data/delete.me")
#     result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
#     Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
#     unlink(temp_data)
#     result
# }else{
#   xmlToList(xmlParse(filepath))
# }
#}

#transpose_list_to_dt <- function(data_list){
#  dt <- t(as.data.table(data_list))
#  dt <- as.data.table(dt)
#  dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
#  dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
#  names(dt) <- names(data_list[[1]])
#  dt
#}

cleansingCorpus <- function(
  htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
  rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE, 
  tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
  rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE, fix.greek = TRUE
){
  if(rem.html){text <- gsub("<.*?>", " ", htmlString)} # removing html commands
  if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
  if(rem.newline){text <- gsub("[\r\n\t]", " ", text)} 
  if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphanumeric
  if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} ##Removing words with more than 30 letters
  if(rem.space){text <- gsub("\\s+", " ", text)}  #removing excess space 
  if(tolower){text <- tolower(text)}
  if(add.space.to.numbers){    #add space between number and letters
    text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
    text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
  }
  if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)} 
  if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
  if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters identified in the beginning of text
  if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
  if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
  if(fix.greek){
    text <- gsub("ς", "σ", text)
    text <- gsub("ά", "α", text)
    text <- gsub("έ", "ε", text)
    text <- gsub("ή", "η", text)
    text <- gsub("ί", "ι", text)
    text <- gsub("ύ", "υ", text)
    text <- gsub("ό", "ο", text)
    text <- gsub("ώ", "ω", text)
  }
  trimws(text)
}

cleansingEducationCorpus <- function(text) {
  text <- gsub("\\.", "", text) #removing periods
  text <- gsub("[[:punct:]]", " ", text) #removing other punctuation
  text <- gsub("\\s+", " ", text) #removing excess space
  text <- tolower(text) #changing case to lower
  #removing accent from Greek
  text <- gsub("ς", "σ", text)
  text <- gsub("ά", "α", text)
  text <- gsub("έ", "ε", text)
  text <- gsub("ή", "η", text)
  text <- gsub("ί", "ι", text)
  text <- gsub("ύ", "υ", text)
  text <- gsub("ό", "ο", text)
  text <- gsub("ώ", "ω", text)
  trimws(text) #trimming white-space
}

#This function removes dates that are "relics" from the xml parsing
removeDates <- function(text){
  days <-  "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
  months <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
  date_form1 <- paste(days, months, "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]")
  date_form2 <- "\\?[0-9][0-9][0-9][0-9]"
  text <- gsub(date_form1, " ", text)
  gsub(date_form2, " ", text)
}

xmlToDataTable <- function(xmlData, itemNames){
  itemList <- lapply(itemNames,
    function(x){
      xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
    }
  )
  names(itemList) <- xmlItems
  as.data.table(itemList)
}

cleanCorpusHtml <- function(text){
  unlist(lapply(text, function(x){
    if(nchar(x) > 0){
      # because nodes were starting with tag keywords in li, we relocate at the end so the information remains and the description 
      # starts with the main content
    html <- gsub(">","> ", x) # add spaces after html tags so these aren't concatenated 
    xml <- read_xml(html, as_html = TRUE)
    lis <- xml_find_all(xml, ".//li")
    xml_remove(lis)
    text <- paste( paste(xml_text(xml), collapse ="") , paste(xml_text(lis) , collapse =""), collapse ="")
    text <- gsub("\\s+"," ",  text)

    }else {""}
  }))
}

#Split equally a vector into chunks of number n_chunks
equal_split <- function(vct, n_chunks) {
  lim <- length(vct)
  fstep <- lim%/%n_chunks
  idx_list <- list()
  for(i in seq(n_chunks - 1)){
    idx_list[[i]] <- vct[((i-1)*fstep + 1):(i*(fstep))]
  }
  idx_list[[n_chunks]] <- vct[((n_chunks - 1)*fstep + 1):(lim)]
  return(idx_list)
}

#Function that takes a vector, and returns thresholded first 10 sorted indexes
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
  vct <- ifelse(vct > threshold, vct, Inf)
    indexVec <- head(order(vct), numHead)
    idVec[indexVec]
}


#Function to read xml nodes in description
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
  xml_attr(xml_find_all(nodes, paste0(".//", elementType)), attribute)
}

#Function to add results to datatable
elementsToDataTable <- function(result, elementType){
  if(length(result) > 0)
    data.table(elementType = elementType, attributeValue = result)
  else
    data.table()
}

#Function to retrieve urls from text
keepHtmlElements <- function(feedItem){
  nodes <- read_xml(paste0("<div>",  feedItem, "</div>"), as_html = TRUE)
  rbind(
    elementsToDataTable(maintainElements(nodes, "a", "href"), "link"),
    elementsToDataTable(maintainElements(nodes, "img", "src"), "image"),
    elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")
    #All "img-src" are NA
  )
}

#retrieve list of parameters in a http request query
getQueryParams <- function(url){
  query <- httr::parse_url(url)$query
  queryValues <- unlist(query)
  queryNames <- names(query)
  dat <- data.table(varName = queryNames, value = queryValues)
  dat[queryValues != ""]
}

#Language detection
detectLanguage <- function(text, precision = 3) {
  if (precision < 1) return(cld3::detect_language(text))
  else if (precision == 2) return(cld2::detect_language(text))
  pred <- cld2::detect_language(text)
  ifelse(pred == cld3::detect_language(text), pred, NA_character_)
}

###################################################################################################
# Functions related to ESCO qualifications scrapping
###################################################################################################

# Returns a vector of the URIs in a particular results page for a given search query
getPageQualificationURIs <- function(locale = "en", eqfLevels = 1:8, pageNum = 1) {
  resultsPageURL <- paste0(
    "https://ec.europa.eu/esco/portal/qualificationSearch?",
    "conceptLanguage=", locale,
    "&searchTerm=",
    "&eqfFilters=", paste(eqfLevels, collapse = ","),
    "&page=", pageNum
  ) %>% url() # for Windows
  resultsHTML <- read_html(resultsPageURL) %>%
    xml_find_all(".//div[@class='content']") %>% 
    xml_children() %>% 
    xml_children() %>% 
    xml_attrs()
  qualificationURIs <- grep(pattern = "http", resultsHTML, value = TRUE)
  uriHead <- regexpr(pattern = "http", qualificationURIs)
  uriTail <- regexpr(pattern = "');", qualificationURIs) - 1
  substr(qualificationURIs, uriHead, uriTail)
}

# Returns total number of pages for a given search query
getNumOfPages <- function(locale = "en", eqfLevels = 1:8) {
  resultsPageURL <- paste0(
    "https://ec.europa.eu/esco/portal/qualificationSearch?",
    "conceptLanguage=", locale,
    "&searchTerm=",
    "&eqfFilters=", paste(eqfLevels, collapse = ","),
    "&page=", 1
  ) %>% url() #for Windows
  numOfQuals <- read_html(resultsPageURL) %>% 
    xml_find_all(".//h1") %>% xml_text() %>% as.numeric()
  qualsPerPage <- length(getPageQualificationURIs())
  round(0.5 + numOfQuals / qualsPerPage)
}

# Returns HTML of a qualification based on its URI for a given locale
getQualificationHTML <- function(uri, locale = "en") {
  paste0(
    "https://ec.europa.eu/esco/portal/qualificationDetails?",
    "conceptLanguage=", locale,
    "&uri=", uri
  ) %>% url() %>% #for Windows
    read_html()
}

# Parses an `xml_nodeset` object to extract useful data
parseXMLNodeSets <- function(xmlNodeSets) {
  lapply(xmlNodeSets, function(x) {
    labelText <- x %>% xml_find_all(".//p[@class='label']") %>% xml_text
    allXml <- x %>% xml_find_all(".//p | .//ul")
    varsIndex <- grep("class=\"label\"", allXml)
    textData <- allXml %>% xml_text
    limits <- c(varsIndex, length(textData) + 1) 
    data <- lapply(seq_along(varsIndex) , function(i) {
      toPaste <- head(limits[i]:limits[i+1], -1)[-1] 
      paste(textData[toPaste], collapse = " ")
    }) %>% as.data.table 
    setnames(data, textData[varsIndex])
    titleText <- x %>% xml_find_all(".//h1") %>% xml_text
    data[, "Title"] <- titleText[1]
    data
  }) %>% rbindlist(fill = TRUE)
}

# Extracts values from text structured in a "Label: Value" format
valuesFromLabeledText <- function(labeledText, label, otherLabels = c()){
  cleanTokens <- labeledText %>%
    paste("ENDLABEL:") %>%
    cleansingCorpus(tolower = FALSE, rem.longwords = FALSE, rem.http = FALSE, rem.nonalphanum = FALSE) %>%
    space_tokenizer()
  label <- gsub("$", ":", label)
  otherLabels <- gsub("$", ":", otherLabels)
  otherLabels <- c("ENDLABEL:", otherLabels[label != otherLabels], label)

  lapply(cleanTokens, function(tokens){
    textStart <- which(tokens %in% label) + 1
    textOther <- which(tokens %in% otherLabels) - 1
    lapply(textStart, function(start){
      fin <- textOther[which(textOther >= start) %>% min]
      tokens[start:fin] %>% paste(collapse = " ")
    }) %>% unlist() %>% paste(collapse = ", ")
  }) %>% unlist() %>% paste0(",")
}

###################################################################################################
# Functions related to TF-IDF calculation
###################################################################################################

# The augumented frequency is used to prevent bias towards longer documents. This choice is 
# justified by the fact that corpus size follows a roughly guassian distribution with respect to 
# EQF level.
# The smooth inverse document frequency is used to prevent IDF from nullifying the TF-IDF in cases 
# where TF can provide useful insigned on its own. That resolves the edge case where a word 
# appears in low frequency in every corpus, but in a significantly high frequency in one corpus.
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1, by.class = "class", threshold = -1) {

  tokensList <- strsplit(corpus[, value], " ")
  names(tokensList) <- corpus[, get(by.class)]

  tokensDT <- lapply(tokensList, as.data.table) %>% 
    rbindlist(idcol = TRUE) %>%
    setnames(c("class", "term"))

  tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
  #inverse document frequency smooth  
  idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
  idfDT[, idf :=  log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]  

  tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
  if(threshold > 0)tfDT[term_count > threshold, term_count := threshold]

  if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count), by = "class"]
  if(normalize == "log")tfDT[, tf := log(1 +term_count)]
  
  merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf, docFreq)]
  
}

#silence warnings pipe
`%W>%` <- function(lhs,rhs){
  w <- options()$warn
  on.exit(options(warn=w))
  options(warn=-1)
  eval.parent(substitute(lhs %>% rhs))
}

getStopwords <- function(locale) { 
  stopwordsLocale <- c(stopwords_getlanguages(source = "misc"), stopwords_getlanguages(source = "snowball")) 
  stopWords <- ""
  if (locale %in% stopwordsLocale) 
    stopWords <- locale %W>% stopwords
  stopWords
}

###################################################################################################
# Text translation
###################################################################################################

translateText <- function(sourceText, sourceLang, translationLang, batchSize = 4800) {
  if (length(sourceText) == 0) {
    return ("")
  } else if (length(sourceText) == 1) {
    return (requestTranslation(sourceText, sourceLang, translationLang))
  } else if (length(sourceText) > 4800) {
    return (NA)
  }

  sourceQueries <- gsub("$", "\n >", sourceText)
  sourceQueries <- gsub("^", "< \n", sourceQueries)
  queries <- data.table(query = sourceQueries, size = nchar(sourceQueries), batch = nchar(sourceQueries))
  for (row in seq(nrow(queries) - 1)) {
    cumulativeSum <- queries[row, batch] + queries[row + 1, batch] + 3
    queries[row + 1, batch := ifelse(cumulativeSum > batchSize, batch, cumulativeSum)]
  }
  batchStarts <- which(queries[, size] == queries[, batch])
  batchFins <- c(batchStarts[-1] - 1, nrow(queries))
  batches <- lapply(seq_along(batchStarts), function(i) batchStarts[i]:batchFins[i])
  pastedQueries <- lapply(batches, function(batch) paste0(queries[batch, query], collapse = "\n")) %>% unlist()

  translatedText <- lapply(pastedQueries, requestTranslation, sourceLang, translationLang) %>% 
    unlist() %>%
    paste0(collapse = " ")
  translatedText <- gsub("\\s?<", "", translatedText)
  gsub(">$", "", translatedText) %>% 
    space_tokenizer(sep = ">") %>% 
    unlist() %>% 
    trimws()
}

requestTranslation <- function(sourceText, sourceLang, translationLang) { 
  googleTranslateURL <- paste0(
    "https://translate.google.com/m",
    "?hl=", sourceLang,
    "&sl=", sourceLang,
    "&tl=", translationLang,
    "&ie=UTF-8&prev=_m&q=", URLencode(sourceText, reserved = TRUE)
  )
  GET(googleTranslateURL, add_headers("user-agent" = "Mozilla/5.0")) %>%
    read_html() %>% 
    xml_child(2) %>%
    xml_child(5) %>%
    xml_text() %>%
    unlist()
}

###################################################################################################
# Text mining
###################################################################################################

findNGrams <- function(corpus, min_n, max_n = min_n, stopWords = NA_character_) {
  ngrams <- itoken(corpus, tokenizer = word_tokenizer, progressbar = FALSE) %>%
    create_vocabulary(stopwords = stopWords, c(min_n, max_n), sep_ngram = " ") %>%
    as.data.table()
  ngrams[order(-doc_count)][, .(term, count = doc_count)]
}

###########################################################################################################

Libraries version

(data.table(library = libraries, version = librariesVersion))

includeCssPublished <- publishIncludeCss()

Session info

sessionInfo()

## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] stopwords_2.2      stringdist_0.9.6.3 text2vec_0.6       DT_0.18           
## [5] magrittr_2.0.1     rmarkdown_2.8      data.table_1.14.0 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6           mlapi_0.1.0          knitr_1.33          
##  [4] RhpcBLASctl_0.20-137 float_0.2-4          lattice_0.20-41     
##  [7] R6_2.5.0             rlang_0.4.11         lgr_0.4.2           
## [10] stringr_1.4.0        highr_0.9            tools_4.0.5         
## [13] grid_4.0.5           xfun_0.23            jquerylib_0.1.4     
## [16] htmltools_0.5.1.1    yaml_2.2.1           digest_0.6.27       
## [19] rsparse_0.4.0        crayon_1.4.1         Matrix_1.3-2        
## [22] vctrs_0.3.8          sass_0.4.0           htmlwidgets_1.5.3   
## [25] evaluate_0.14        stringi_1.6.2        compiler_4.0.5      
## [28] bslib_0.2.5          jsonlite_1.7.2

timeNeeded <- (proc.time()[3] -  source.starting.time);

../00.core/00.02.base.functions.R completed in 0.15 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Data loading

inputRepo <- getSourcePath("jobsOutput/")

Loading multilingual Education CVs

filename <- "education/data/educationFeatures.rds"
educationFeatures <- loadBinary(filename, inputRepo)
educationFeatures[, index := as.character(index)]

Type of data: data.table, data.frame.
Dimensions: 997068, 19.
Column Names: id, locale, eqfLevelCode, eqfLevelLabel, organisationCountry, organisation, from, to, title, index, birthYear, curAge, enrolAge, graduAge, studyAge, length, cumLength, status, numQual.

Loading Level matches

filename <- "education/data/suggestedLevels.rds"
suggestedLevels <- loadBinary(filename, inputRepo)

Type of data: data.table, data.frame.
Dimensions: 3395165, 5.
Column Names: index, level, total_weight, voc_matches, level_matches.

Loading Qualification matches

filename <- "education/data/suggestedQualifications.rds"
suggestedQualifications <- loadBinary(filename, inputRepo)

Type of data: data.table, data.frame.
Dimensions: 21770484, 6.
Column Names: index, level, code, total_weight, voc_matches, code_matches.

timeNeeded <- (proc.time()[3] -  source.starting.time);

1.load.data.R completed in 33.55 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Data processing

In the previous process we arrived to suggestions for the EQF level and qualification of each free text of each CV. Specifically, the relative probability of each qualification belonging to each EQF level and each qualification has been calculated. It will be further processed to arrive to a final conclusion.

Joining data

educationSuggestions <- merge(educationFeatures, suggestedQualifications[, .(index, level, weight = total_weight)], by = "index", allow.cartesian = TRUE)
suggestedLevels[, level := as.numeric(level)]
educationSuggestions <- merge(educationSuggestions, suggestedLevels[, .(index, level, level_weight = total_weight)], by = c("index", "level"))
educationSuggestions[, weight := weight * level_weight]
educationSuggestions <- educationSuggestions[order(index, -weight)]
educationSuggestions[, indexLevel := paste0(index, "_", level)]
educationSuggestions <- educationSuggestions[!duplicated(indexLevel)][, indexLevel := NULL]

Filtering qualifications by enrollment and graduation age.

educationSuggestions[level == 1 & enrolAge < 6, weight := 1.1**(enrolAge - 6) * weight]
educationSuggestions[level > 1 & level < 4 & studyAge < 6, weight := 1.1**(studyAge - 6) * weight]
educationSuggestions[level > 1 & level < 4 & studyAge > 18, weight := 1.1**(18 - studyAge) * weight]
educationSuggestions[level == 4 & graduAge > 18, weight := 1.1**(18 - graduAge) * weight]
educationSuggestions[level == 5 & enrolAge < 18, weight := 1.1**(enrolAge - 18) * weight]
educationSuggestions[level == 5 & graduAge < 20, weight := 1.1**(graduAge - 20) * weight]
educationSuggestions[level == 6 & enrolAge < 18, weight := 1.1**(enrolAge - 18) * weight]
educationSuggestions[level == 6 & graduAge < 22, weight := 1.1**(graduAge - 22) * weight]
educationSuggestions[level == 7 & enrolAge < 22, weight := 1.1**(enrolAge - 22) * weight]
educationSuggestions[level == 7 & graduAge < 23, weight := 1.1**(graduAge - 23) * weight]
educationSuggestions[level == 8 & enrolAge < 23, weight := 2.1**(enrolAge - 23) * weight]
educationSuggestions[level == 8 & graduAge < 26, weight := 2.1**(graduAge - 26) * weight]

Filtering qualifications by length of completion.

educationSuggestions[level == 1 & length < 4, weight := 1.1**(length - 4) * weight]
educationSuggestions[level == 2 & length < 2, weight := 1.1**(length - 2) * weight]
educationSuggestions[level == 3 & length > 2, weight := 1.1**(2 - length) * weight]
educationSuggestions[level == 4 & length < 3, weight := 1.1**(length - 3) * weight]
educationSuggestions[level == 5 & length > 3, weight := 1.1**(3 - length) * weight]
educationSuggestions[level == 6 & length < 3, weight := 1.1**(length - 3) * weight]
educationSuggestions[level == 7 & length > 2, weight := 1.1**(2 - length) * weight]
educationSuggestions[level == 8 & length < 4, weight := 1.1**(length - 4) * weight]

Filtering qualifications by current age of holder.

educationSuggestions[level == 8 & status == "Complete" & curAge < 26, weight := 2.1**(curAge - 26) * weight]
educationSuggestions[level == 7 & status == "Complete" & curAge < 22, weight := 2.1**(curAge - 22) * weight]
educationSuggestions[level == 6 & status == "Complete" & curAge < 21, weight := 2.1**(curAge - 21) * weight]
educationSuggestions[level == 5 & status == "Complete" & curAge < 20, weight := 2.1**(curAge - 20) * weight]

Keeping top suggestion.

educationSuggestions <- educationSuggestions[!duplicated(index)]

Keeping upper 90% quantile

educationSuggestions <- educationSuggestions[level_weight > quantile(level_weight, 0.1)]

Removing mismatches and filtering out likely certifications, courses and other qualification entries not equivalent to EQF levels.

educationSuggestions[level == 1 & enrolAge > 8, level := NA]
educationSuggestions[level == 2 & enrolAge > 14, level := NA]
educationSuggestions[level == 3 & enrolAge > 16, level := NA]
educationSuggestions[level == 4 & enrolAge > 16, level := NA]
educationSuggestions[level == 5 & graduAge < 18, level := NA]
educationSuggestions[level == 6 & graduAge < 20, level := NA]
educationSuggestions[level == 7 & graduAge < 21, level := NA]
educationSuggestions[level == 8 & graduAge < 23, level := NA]

Filtering out likely certifications and short courses.

educationSuggestions[status == "One-time", level := NA]

Utilizing drop-down EQF level.

cvsWithDropDown <- educationSuggestions[!is.na(eqfLevelCode), id] %>% unique()
educationSuggestions[level != eqfLevelCode, level := eqfLevelCode]
#educationSuggestions[id %in% cvsWithDropDown & is.na(eqfLevelCode), level := NA]

Accounting for ongoing studies.

cvSuggestionsOngoing <- educationSuggestions[!is.na(level) & status == "Incomplete", .(ongoing_level = max(level), ongoing_weight = level_weight), by = "id"][order(-ongoing_weight)][!duplicated(id)]
cvSuggestionsComplete <- educationSuggestions[!is.na(level) & (status != "Incomplete" | is.na(status)), .(complete_level = max(level), complete_weight = level_weight), by = "id"][order(-complete_weight)][!duplicated(id)]
cvSuggestions <- merge(cvSuggestionsOngoing, cvSuggestionsComplete, by = "id", all = TRUE)
cvSuggestions[complete_level >= ongoing_level & complete_weight > ongoing_weight, ongoing_level := NA]
cvSuggestions[complete_level >= ongoing_level & complete_weight <= ongoing_weight, complete_level := NA]
cvSuggestions[complete_level <= 3 & ongoing_level > 4 & complete_weight > ongoing_weight, ongoing_level := NA]
cvSuggestions[complete_level <= 3 & ongoing_level > 4 & complete_weight <= ongoing_weight, complete_level := NA]
cvSuggestions[complete_level <= 5 & ongoing_level > 6 & complete_weight > ongoing_weight, ongoing_level := NA]
cvSuggestions[complete_level <= 5 & ongoing_level > 6 & complete_weight <= ongoing_weight, complete_level := NA]

Determining level for each CV and qualification.

cvSuggestions <- merge(cvSuggestions, educationFeatures[!duplicated(id), .(id)], by = "id", all = TRUE)
cvSuggestions[educationFeatures[status == "Incomplete"][!duplicated(id)], on = "id", student := TRUE]

qualSuggestions <- educationSuggestions[, .(index, level = level)]

timeNeeded <- (proc.time()[3] -  source.starting.time);

2.process.data.R completed in 60.38 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)


source.starting.time <- proc.time()[3]

Data persistance

outputRepo <- getSourcePath("jobsOutput/education/predictions/")

filename <- "qualificationPredictions.rds"
saveBinary(qualSuggestions, filename, outputRepo)

Datasource : /data/generic/jobsOutput/education/predictions/qualificationPredictions.rds of size 3.1 Mb.

Exposing data

reportTabularData(head(qualSuggestions, 30))

## NULL

filename <- "cvPredictions.rds"
saveBinary(cvSuggestions, filename, outputRepo)

Datasource : /data/generic/jobsOutput/education/predictions/cvPredictions.rds of size 11.5 Mb.

Exposing data

reportTabularData(head(cvSuggestions, 30))

## NULL

timeNeeded <- (proc.time()[3] -  source.starting.time);

3.save.data.R completed in 3.02 seconds

sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)

Computation metrics

source.blocks$sourceTimeNeeded <- sourceTimeNeeded;

Computational report

Completed in 98.78 seconds.

Subpart metrics

reportTabularData(source.blocks);

## NULL

Qualifications Matching (Weighted)

Voting algorithm

Report generated on

2021-07-05