Abstract
This function loads the skills data sets for all the different locales, cleans the free text and keeps only the relevant information. In particular, for each ESCO skill, the URI, the skill name and the associated text are saved for each language, as provided by ESCO.
# http://rmarkdown.rstudio.com/html_document_format.htm
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]
Information about the libraries, environment, sources used and their execution is reported. Additional information is provided within section tabs. Navigating through the report is also possible through the table of contents. Reported tables can be dynamically filtered, searched, ordered and exported into various formats.
librariesVersion <- c()
for(i in 1:length(libraries))
librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: DT
## Loading required package: stringr
## Loading required package: xml2
## Loading required package: parallel
../000.core/00.01.libraries.R completed in 0.49 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# ESCO skills
# @authors ds@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
if(length(colClasses) == 0)
return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
}
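As a usage sketch, readData resolves the filename against the repository folder and delegates to data.table::fread; the path below is hypothetical:
dt <- readData("input/example/data.tsv") # reads /data/generic/input/example/data.tsv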
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
fileName <- getSourcePath(filename, baseFolder)
dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
if(format == "rds") saveRDS(data, fileName)
if(format == "fst") fst::write_fst(data, fileName)
}
#alternative for rough read write operations
saveRDS_ <- function(object, file){
dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
saveRDS(object, file)
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
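A minimal round trip of the two helpers, with a hypothetical target file under the repository folder (the directory is created if missing):
dt <- data.table::data.table(id = 1:3, label = c("a", "b", "c"))
saveBinary(dt, filename = "examples/demo.rds") # hypothetical path
loadBinary("examples/demo.rds")
##    id label
## 1:  1     a
## 2:  2     b
## 3:  3     c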
rowColumns <- function(data){
return(paste( format(nrow(data), big.mark=","), "Rows X ", ncol(data), "Columns"))
}
publishIncludeCss <- function(){
sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
destinationFile <- "/data/tmpfs/results/include.css"
if (!file.exists(destinationFile)) {
return(file.copy(sourceFile, destinationFile))
}else{
return(TRUE)
}
}
#As the mounted storage is in memory, make sure the asset include.css is there.
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
for(name in names(data)){
if(is.character(data[[name]])){
data[[name]] <- as.factor(data[[name]])
}
}
return(data)
}
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName %in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data, anonymize=TRUE){
if(anonymize)return()
columnsName <- names(data)
columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
#result <- styliseDTNumericalColumn(data,result, "percentMatch", '#5fba7d', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
#read_xml_to_list <- function(filepath, is.gz = FALSE){
# if(is.gz){
# temp_data <- paste0(repository, "data/delete.me")
# result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
# Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
# unlink(temp_data)
# result
# }else{
# xmlToList(xmlParse(filepath))
# }
#}
#transpose_list_to_dt <- function(data_list){
# dt <- t(as.data.table(data_list))
# dt <- as.data.table(dt)
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# names(dt) <- names(data_list[[1]])
# dt
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} # removing html commands
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)}
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words with 35 or more characters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between numbers and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters identified in the beginning of text
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
trimws(text)
}
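For illustration, the default pipeline applied to a hand-made snippet (not taken from the ESCO data): html tags and the URL are stripped, punctuation becomes space, case is lowered and whitespace is collapsed.
cleansingCorpus("<p>Manage <b>ESCO</b> skills.\nSee https://example.com/info.html now</p>")
## [1] "manage esco skills see now"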
#This function removes dates that are "relics" from the xml parsing
removeDates <- function(text){
days <- "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
months <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
date_form1 <- paste(days, months, "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]")
date_form2 <- "\\?[0-9][0-9][0-9][0-9]"
text <- gsub(date_form1, " ", text)
gsub(date_form2, " ", text)
}
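A quick check on a made-up string; the leftover whitespace is collapsed later by cleansingCorpus:
removeDates("Updated Monday, January 5, 2020 by the team")
## [1] "Updated   by the team"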
xmlToDataTable <- function(xmlData, itemNames){
itemList <- lapply(itemNames,
function(x){
xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
}
)
names(itemList) <- itemNames
as.data.table(itemList)
}
cleanCorpusHtml <- function(text){
unlist(lapply(text, function(x){
if(nchar(x) > 0){
# because some nodes start with tag keywords inside li elements, the li text is relocated to the end,
# so the information remains and the description starts with the main content
html <- gsub(">","> ", x) # add spaces after html tags so these aren't concatenated
xml <- read_xml(html, as_html = TRUE)
lis <- xml_find_all(xml, ".//li")
xml_remove(lis)
text <- paste( paste(xml_text(xml), collapse ="") , paste(xml_text(lis) , collapse =""), collapse ="")
text <- gsub("\\s+"," ", text)
}else {""}
}))
}
#Split a vector into n_chunks chunks of roughly equal size
equal_split <- function(vct, n_chunks) {
lim <- length(vct)
fstep <- lim%/%n_chunks
idx_list <- list()
for(i in seq(n_chunks - 1)){
idx_list[[i]] <- vct[((i-1)*fstep + 1):(i*(fstep))]
}
idx_list[[n_chunks]] <- vct[((n_chunks - 1)*fstep + 1):(lim)]
return(idx_list)
}
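For example, splitting ten indices into three chunks leaves the remainder in the last chunk:
equal_split(1:10, 3)
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] 4 5 6
##
## [[3]]
## [1]  7  8  9 10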
#Function that takes a vector and returns the IDs of the (at most numHead) smallest values above the threshold
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
vct <- ifelse(vct > threshold, vct, Inf)
indexVec <- head(order(vct), numHead)
idVec[indexVec]
}
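A small sketch with made-up distances: values at or below the threshold are masked with Inf, the rest are ranked ascending and the corresponding IDs are returned:
getThresholdOrderRwmd(c(0.5, 0, 0.2, 0.9), c("a", "b", "c", "d"), numHead = 2)
## [1] "c" "a"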
#Function to read xml nodes in description
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
xml_attr(xml_find_all(nodes, paste0(".//", elementType)), attribute)
}
#Function to add results to datatable
elementsToDataTable <- function(result, elementType){
if(length(result) > 0)
data.table(elementType = elementType, attributeValue = result)
else
data.table()
}
#Function to retrieve urls from text
keepHtmlElements <- function(feedItem){
nodes <- read_xml(paste0("<div>", feedItem, "</div>"), as_html = TRUE)
rbind(
elementsToDataTable(maintainElements(nodes, "a", "href"), "link"),
elementsToDataTable(maintainElements(nodes, "img", "src"), "image"),
elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")
#All "img-src" are NA
)
}
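Applied to a hand-made snippet, the helper returns one row per extracted attribute:
keepHtmlElements('<a href="https://example.com">site</a> <img src="pic.png"/>')
##    elementType      attributeValue
## 1:        link https://example.com
## 2:       image             pic.png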
#retrieve list of parameters in a http request query
getQueryParams <- function(url){
query <- httr::parse_url(url)$query
queryValues <- unlist(query)
queryNames <- names(query)
dat <- data.table(varName = queryNames, value = queryValues)
dat[queryValues != ""]
}
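For example, with a hypothetical URL, parameters with empty values are dropped:
getQueryParams("https://example.com/search?q=skills&lang=en&empty=")
##    varName  value
## 1:       q skills
## 2:    lang     en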
keepCountryName <- function(string){
string <- gsub(".*_", "", string)
gsub("\\..*", "", string)
}
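This helper derives the locale from the input file names used below, e.g.:
keepCountryName("skills_en.csv")
## [1] "en"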
keepNTokens <- function(string, num){
tokenList <- strsplit(string, split = " ")
sapply(tokenList, function(tokens){
tokens <- sort(tokens)
tokensShift <- shift(tokens, -num, fill = FALSE)
paste(tokens[tokens != tokensShift], collapse = " ")
})
}
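keepNTokens keeps at most num occurrences of each token (after sorting), e.g.:
keepNTokens("data data data science", 1)
## [1] "data science"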
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {
tokensList <- strsplit(corpus[, text], " ")
names(tokensList) <- corpus[, code]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count), by = "class"]
if(normalize == "log")tfDT[, tf := log(1 +term_count)]
merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf)]
}
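A toy corpus (made up here; the real input is a data.table with columns code and text) illustrates the output shape: the term shared by both documents receives a lower tf-idf than the document-specific ones.
corpus <- data.table(code = c("d1", "d2"), text = c("manage data projects", "analyse data"))
findTFIDF(corpus, stopwords = character(0))
##        term class     tfIdf
## 1:  analyse    d2 1.0000000
## 2:     data    d1 0.5945349
## 3:     data    d2 0.5945349
## 4:   manage    d1 1.0000000
## 5: projects    d1 1.0000000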
###########################################################################################################
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] xml2_1.3.2 stringr_1.4.0 DT_0.18 dplyr_1.0.6
## [5] rmarkdown_2.8 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] knitr_1.33 magrittr_2.0.1 tidyselect_1.1.1 R6_2.5.0
## [5] rlang_0.4.11 fansi_0.4.2 highr_0.9 tools_4.0.5
## [9] xfun_0.23 utf8_1.2.1 jquerylib_0.1.4 htmltools_0.5.1.1
## [13] ellipsis_0.3.2 yaml_2.2.1 digest_0.6.27 tibble_3.1.1
## [17] lifecycle_1.0.0 crayon_1.4.1 purrr_0.3.4 htmlwidgets_1.5.3
## [21] sass_0.4.0 vctrs_0.3.8 glue_1.4.2 evaluate_0.14
## [25] stringi_1.6.2 compiler_4.0.5 bslib_0.2.5 pillar_1.6.0
## [29] generics_0.1.0 jsonlite_1.7.2 pkgconfig_2.0.3
../000.core/00.02.base.functions.R completed in 0.1 seconds
Retrieving all locales as provided by ESCO and loading the corresponding .csv files.
inputWorkingDir <- paste0(repository, "input/skills/")
locales <- list.files(inputWorkingDir) %>%
keepCountryName()
Languages: ar, bg, cs, da, de, el, en, es, et, fi, fr, ga, hr, hu, is, it, lt, lv, mt, nl, no, pl, pt, ro, sk, sl, sv.
skillsListDt <- lapply(locales, function(x){
fread(paste0(inputWorkingDir, "skills_", x, ".csv"), na.strings = c("", "NULL"))
})
names(skillsListDt) <- locales
## conceptType conceptUri skillType reuseLevel
## Length:13485 Length:13485 Length:13485 Length:13485
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## preferredLabel altLabels hiddenLabels status
## Length:13485 Length:13485 Length:13485 Length:13485
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## modifiedDate scopeNote definition
## Min. :2016-05-02 16:42:36 Length:13485 Length:13485
## 1st Qu.:2016-12-20 18:16:02 Class :character Class :character
## Median :2016-12-20 19:35:15 Mode :character Mode :character
## Mean :2017-01-24 08:29:02
## 3rd Qu.:2016-12-20 20:51:29
## Max. :2020-06-17 16:07:31
## inScheme description
## Length:13485 Length:13485
## Class :character Class :character
## Mode :character Mode :character
##
##
##
hasData <- lapply(skillsListDt, function(data){
sapply(data, function(x)sum(!is.na(x))/length(x)) %>%
data.table %>%
transpose
}) %>% rbindlist(idcol = TRUE)
setnames(hasData, names(hasData)[-1], names(skillsListDt[[1]]))
## .id conceptType conceptUri skillType reuseLevel
## Length:27 Min. :1 Min. :1 Min. :1 Min. :1
## Class :character 1st Qu.:1 1st Qu.:1 1st Qu.:1 1st Qu.:1
## Mode :character Median :1 Median :1 Median :1 Median :1
## Mean :1 Mean :1 Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1 Max. :1 Max. :1
## preferredLabel altLabels hiddenLabels status
## Min. :0.9999 Min. :0.000964 Min. :0.002373 Min. :1
## 1st Qu.:1.0000 1st Qu.:0.051353 1st Qu.:0.002373 1st Qu.:1
## Median :1.0000 Median :0.105006 Median :0.002373 Median :1
## Mean :1.0000 Mean :0.165495 Mean :0.002398 Mean :1
## 3rd Qu.:1.0000 3rd Qu.:0.169188 3rd Qu.:0.002373 3rd Qu.:1
## Max. :1.0000 Max. :0.962773 Max. :0.002595 Max. :1
## modifiedDate scopeNote definition inScheme
## Min. :1 Min. :0.0000000 Min. :0.000e+00 Min. :1
## 1st Qu.:1 1st Qu.:0.0000000 1st Qu.:0.000e+00 1st Qu.:1
## Median :1 Median :0.0000000 Median :0.000e+00 Median :1
## Mean :1 Mean :0.0006592 Mean :8.240e-06 Mean :1
## 3rd Qu.:1 3rd Qu.:0.0000000 3rd Qu.:0.000e+00 3rd Qu.:1
## Max. :1 Max. :0.0177976 Max. :2.225e-04 Max. :1
## description
## Min. :0.9999
## 1st Qu.:1.0000
## Median :1.0000
## Mean :1.0000
## 3rd Qu.:1.0000
## Max. :1.0000
1.load.data.R completed in 3.9 seconds
Removing columns with low information and cleansing the text for each language.
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores, type = "FORK")
skillsListDtClean <- parLapply(cl, locales, function(x) {
columnsToKeep <- c("conceptUri", "skillType", "preferredLabel", "altLabels", "description")
dt <- skillsListDt[[x]][, columnsToKeep, with = FALSE]
dt[, displayTitle := preferredLabel]
#cleansing text
dt[, skillType := gsub(".*/", "", skillType)]
dt[, conceptUri := gsub(".*/", "", conceptUri)]
for (j in columnsToKeep[-1])
set(dt, j = j, value = cleansingCorpus(dt[[j]]))
dt
})
stopCluster(cl)
names(skillsListDtClean) <- locales
## NULL
2.process.data.R completed in 29.11 seconds
Datasource : /data/generic/jobsOutput/skills/skillsListDtClean.rds of 39,646,671 bytes.
repo <- "jobsOutput/skills/escoSkillsClean/"
no_answer <-
lapply(locales, function(x) {
name <- paste0(repo, "processed_skills_", x, ".rds")
saveBinary(skillsListDtClean[[x]], filename = name,
baseFolder = repository)
})
Datasource : /data/generic/jobsOutput/skills/escoSkillsClean/.
3.save.data.R completed in 22.44 seconds
Completed in 56.03 seconds.