Abstract
This process creates standardized unigram and bigram frequency datasets for qualifications-related text. A separate binary file is created for each locale to accommodate use with the Shiny application.
# http://rmarkdown.rstudio.com/html_document_format.htm
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]
Information about the libraries, environment, sources used and their execution is reported. Additional information is provided within section tabs. Navigating through the report is also possible through the table of contents. Reported tables can be dynamically filtered, searched, ordered and exported into various formats.
librariesVersion <- c()
for(i in 1:length(libraries))
librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: DT
## Loading required package: stringr
## Loading required package: magrittr
## Loading required package: ISOcodes
../000.core/00.01.libraries.R completed in 0.48 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# ESCO skills
# @authors ds@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
if(length(colClasses) == 0)
return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
}
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
fileName <- getSourcePath(filename, baseFolder)
dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
if(format == "rds") saveRDS(data, fileName)
if(format == "fst") fst::write_fst(data, fileName)
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
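For illustration, a minimal round-trip sketch of these helpers; the example/terms.fst path is hypothetical and assumes the repository folder is writable.
exampleDT <- data.table::data.table(term = c("skill", "course"), count = c(10L, 3L))
saveBinary(exampleDT, "example/terms.fst", format = "fst")
loadBinary("example/terms.fst", format = "fst")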
rowColumns <- function(data){
return(paste( format(nrow(data), big.mark=","), "Rows X ", ncol(data), "Columns"))
}
#as the mount storage is in memory, make sure the asset include.css is there.
publishIncludeCss <- function(){
sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
destinationFile <- "/data/tmpfs/results/include.css"
if (!file.exists(destinationFile)) {
return (file.copy(sourceFile, destinationFile))
}else{
return(TRUE)
}
}
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
for(name in names(data)){
if( class(data[[name]]) =="character"){
data[[name]] <- as.factor(data[[name]])
}
}
return(data)
}
codeBook <- function(dataset){
out <- lapply(names(dataset), function(var_name) {
knitr::knit_expand(text = readLines("../000.core/codeBook.template"))
})
cat(
knitr::knit(
text = unlist(paste(out, collapse = '\n')),
quiet = TRUE)
)
}
fwrite_zip <- function(data, filename, quote = TRUE){
dir.create(dirname(filename), recursive = TRUE, showWarnings = FALSE)
filename_csv <- strsplit(filename, "/") %>% unlist %>% tail(1)
filename_csv <- gsub(".zip", ".csv", filename_csv)
fwrite(data, filename_csv, quote = quote)
if(file.exists(filename))unlink(filename)
zip(filename, filename_csv)
unlink(filename_csv)
}
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName%in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data){
columnsName <- names(data)
columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
#result <- styliseDTNumericalColumn(data,result, "percentMatch", '#5fba7d', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
#read_xml_to_list <- function(filepath, is.gz = FALSE){
# if(is.gz){
# temp_data <- paste0(repository, "data/delete.me")
# result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
# Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
# unlink(temp_data)
# result
# }else{
# xmlToList(xmlParse(filepath))
# }
#}
#transpose_list_to_dt <- function(data_list){
# dt <- t(as.data.table(data_list))
# dt <- as.data.table(dt)
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# names(dt) <- names(data_list[[1]])
# dt
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} #removing html tags
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)} #removing newlines and tabs
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words with 35 or more letters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between numbers and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters from the beginning of the text
trimws(text)
}
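A hedged illustration of the cleansing defaults on a hypothetical HTML fragment; note that digits are dropped because rem.nonalphanum keeps letters only.
cleansingCorpus("<p>Awarded 3 ECTS credits, see https://example.org</p>")
# roughly: "awarded ects credits see"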
cleansingEducationCorpus <- function(text) {
text <- gsub("\\.", "", text) #removing periods
text <- gsub("[[:punct:]]", " ", text) #removing other punctuation
text <- gsub("\\s+", " ", text) #removing excess space
text <- tolower(text) #changing case to lower
#removing accents from Greek vowels and mapping final sigma (ς) to σ
text <- gsub("ς", "σ", text)
text <- gsub("ά", "α", text)
text <- gsub("έ", "ε", text)
text <- gsub("ή", "η", text)
text <- gsub("ί", "ι", text)
text <- gsub("ύ", "υ", text)
text <- gsub("ό", "ο", text)
text <- gsub("ώ", "ω", text)
trimws(text) #trimming white-space
}
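A brief hedged example of the Greek normalisation on a hypothetical string.
cleansingEducationCorpus("Πτυχίο Πληροφορικής!")
# roughly: "πτυχιο πληροφορικησ" (punctuation dropped, accents removed, final sigma mapped to σ)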
#This function removes dates that are "relics" from the xml parsing
removeDates <- function(text){
days <- "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
months <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
date_form1 <- paste(days, months, "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]")
date_form2 <- "\\?[0-9][0-9][0-9][0-9]"
text <- gsub(date_form1, " ", text)
gsub(date_form2, " ", text)
}
xmlToDataTable <- function(xmlData, itemNames){
itemList <- lapply(itemNames,
function(x){
xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
}
)
names(itemList) <- itemNames
as.data.table(itemList)
}
cleanCorpusHtml <- function(text){
unlist(lapply(text, function(x){
if(nchar(x) > 0){
# list items (<li>) often start a node with tag-like keywords; they are relocated to the end
# so the description starts with the main content and no information is lost
html <- gsub(">","> ", x) # add spaces after html tags so these aren't concatenated
xml <- read_xml(html, as_html = TRUE)
lis <- xml_find_all(xml, ".//li")
xml_remove(lis)
text <- paste( paste(xml_text(xml), collapse ="") , paste(xml_text(lis) , collapse =""), collapse ="")
text <- gsub("\\s+"," ", text)
}else {""}
}))
}
#Split equally a vector into chunks of number n_chunks
equal_split <- function(vct, n_chunks) {
lim <- length(vct)
fstep <- lim%/%n_chunks
idx_list <- list()
for(i in seq(n_chunks - 1)){
idx_list[[i]] <- vct[((i-1)*fstep + 1):(i*(fstep))]
}
idx_list[[n_chunks]] <- vct[((n_chunks - 1)*fstep + 1):(lim)]
return(idx_list)
}
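For example, splitting ten indices into three chunks places the remainder in the last chunk.
equal_split(1:10, 3)
# list(1:3, 4:6, 7:10)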
#Function that takes a vector, and returns thresholded first 10 sorted indexes
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
vct <- ifelse(vct > threshold, vct, Inf)
indexVec <- head(order(vct), numHead)
idVec[indexVec]
}
#Function to read xml nodes in description
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
xml_attr(xml_find_all(nodes, paste0(".//", elementType)), attribute)
}
#Function to add results to datatable
elementsToDataTable <- function(result, elementType){
if(length(result) > 0)
data.table(elementType = elementType, attributeValue = result)
else
data.table()
}
#Function to retrieve urls from text
keepHtmlElements <- function(feedItem){
nodes <- read_xml(paste0("<div>", feedItem, "</div>"), as_html = TRUE)
rbind(
elementsToDataTable(maintainElements(nodes, "a", "href"), "link"),
elementsToDataTable(maintainElements(nodes, "img", "src"), "image"),
elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")
#All "img-src" are NA
)
}
#retrieve list of parameters in a http request query
getQueryParams <- function(url){
query <- httr::parse_url(url)$query
queryValues <- unlist(query)
queryNames <- names(query)
dat <- data.table(varName = queryNames, value = queryValues)
dat[queryValues != ""]
}
keepCountryName <- function(string){
string <- gsub(".*_", "", string)
gsub("\\..*", "", string)
}
keepNTokens <- function(string, num){
tokenList <- strsplit(string, split = " ")
sapply(tokenList, function(tokens){
tokens <- sort(tokens)
tokensShift <- shift(tokens, -num, fill = FALSE)
paste(tokens[tokens != tokensShift], collapse = " ")
})
}
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {
tokensList <- strsplit(corpus[, text], " ")
names(tokensList) <- corpus[, code]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count)]
if(normalize == "log")tfDT[, tf := log(1 +term_count)]
merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf)]
}
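A minimal sketch of findTFIDF on a hypothetical two-document corpus; the input needs text and code columns.
corpusExample <- data.table(
code = c("de", "fr"),
text = c("master degree in informatics", "licence professionnelle en informatique")
)
findTFIDF(corpusExample, stopwords = c("in", "en"), min_char = 2)
# one row per (term, class) pair with its smoothed tf-idf score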
tidyJsonData <- function(jsonList){
if(length(jsonList) == 0)return(NULL)
unlistOccupations <- jsonList %>% unlist
codesMaleFemale <- names(unlistOccupations)
epasMapping <- data.table(unlistOccupations)
epasMapping[ , code := gsub("\\.[[:alpha:]]$", "", codesMaleFemale)]
uniqueMappingBoolean <- epasMapping[ , unlistOccupations != c(unlistOccupations[-1], F), by = code]$V1
codesEpasDB <- epasMapping[uniqueMappingBoolean]
names(codesEpasDB) <- c("title", "code")
codesEpasDB[ , title := cleansingCorpus(title)]
}
findNGrams <- function(corpus, min_n, max_n = min_n, stopWords = NA_character_) {
ngrams <- itoken(corpus, tokenizer = word_tokenizer, progressbar = FALSE) %>%
create_vocabulary(stopwords = stopWords, ngram = c(min_n, max_n), sep_ngram = " ") %>%
as.data.table()
ngrams[order(-doc_count)][, .(term, count = doc_count)]
}
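A small sketch, assuming the text2vec package (which provides itoken, word_tokenizer and create_vocabulary) is attached.
findNGrams(c("bachelor of science", "master of science"), min_n = 1, max_n = 2)
# a data.table of unigrams and bigrams ordered by document count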
`%W>%` <- function(lhs,rhs){
w <- options()$warn
on.exit(options(warn=w))
options(warn=-1)
eval.parent(substitute(lhs %>% rhs))
}
getStopwords <- function(locale) {
stopwordsLocale <- c(stopwords_getlanguages(source = "misc"), stopwords_getlanguages(source = "snowball"))
stopWords <- ""
if (locale %in% stopwordsLocale)
stopWords <- locale %W>% stopwords
stopWords
}
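Assuming the stopwords package is attached, the helper returns a locale's stopword list when one exists and an empty string otherwise.
getStopwords("en") # English stopwords from the snowball source
getStopwords("xx") # no list available, returns ""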
###################################################################################################
# Text translation
###################################################################################################
translateText <- function(sourceText, sourceLang, translationLang, batchSize = 4800) {
if (length(sourceText) == 0) {
return ("")
} else if (length(sourceText) == 1) {
return (requestTranslation(sourceText, sourceLang, translationLang))
} else if (length(sourceText) > 4800) {
return (NA)
}
sourceQueries <- gsub("$", "\n >", sourceText)
sourceQueries <- gsub("^", "< \n", sourceQueries)
queries <- data.table(query = sourceQueries, size = nchar(sourceQueries), batch = nchar(sourceQueries))
for (row in seq(nrow(queries) - 1)) {
cumulativeSum <- queries[row, batch] + queries[row + 1, batch] + 3
queries[row + 1, batch := ifelse(cumulativeSum > batchSize, batch, cumulativeSum)]
}
batchStarts <- which(queries[, size] == queries[, batch])
batchFins <- c(batchStarts[-1] - 1, nrow(queries))
batches <- lapply(seq_along(batchStarts), function(i) batchStarts[i]:batchFins[i])
pastedQueries <- lapply(batches, function(batch) paste0(queries[batch, query], collapse = "\n")) %>% unlist()
translatedText <- lapply(pastedQueries, requestTranslation, sourceLang, translationLang) %>%
unlist() %>%
paste0(collapse = " ")
translatedText <- gsub("\\s?<", "", translatedText)
gsub(">$", "", translatedText) %>%
space_tokenizer(sep = ">") %>%
unlist() %>%
trimws()
}
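A hedged usage sketch: this sends a live request to translate.google.com through requestTranslation below, so it depends on network access and on the page structure remaining stable.
translateText("Πτυχίο Πληροφορικής", "el", "en")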
requestTranslation <- function(sourceText, sourceLang, translationLang) {
googleTranslateURL <- paste0(
"https://translate.google.com/m",
"?hl=", sourceLang,
"&sl=", sourceLang,
"&tl=", translationLang,
"&ie=UTF-8&prev=_m&q=", URLencode(sourceText, reserved = TRUE)
)
GET(googleTranslateURL, add_headers("user-agent" = "Mozilla/5.0")) %>%
read_html() %>%
xml_child(2) %>%
xml_child(5) %>%
xml_text() %>%
unlist()
}
###########################################################################################################
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ISOcodes_2021.02.24 magrittr_2.0.1 stringr_1.4.0
## [4] DT_0.18 dplyr_1.0.6 rmarkdown_2.8
## [7] data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] knitr_1.33 tidyselect_1.1.1 R6_2.5.0 rlang_0.4.11
## [5] fansi_0.4.2 highr_0.9 tools_4.0.5 xfun_0.23
## [9] utf8_1.2.1 jquerylib_0.1.4 htmltools_0.5.1.1 ellipsis_0.3.2
## [13] yaml_2.2.1 digest_0.6.27 tibble_3.1.1 lifecycle_1.0.0
## [17] crayon_1.4.1 purrr_0.3.4 htmlwidgets_1.5.3 sass_0.4.0
## [21] vctrs_0.3.8 glue_1.4.2 evaluate_0.14 stringi_1.6.2
## [25] compiler_4.0.5 bslib_0.2.5 pillar_1.6.0 generics_0.1.0
## [29] jsonlite_1.7.2 pkgconfig_2.0.3
../000.core/00.02.base.functions.R completed in 0.11 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
euCountries <- c("BE", "BG", "CZ", "DK", "DE", "EE", "IE", "EL", "ES", "FR",
"HR", "IT", "CY", "LV", "LT", "LU", "HU", "MT", "NL", "AT",
"PL", "PT", "RO", "SI", "SK", "FI", "SE", "UK")
neuCountries <- c("ME", "MK", "AL", "RS", "TR")
getSourcePath <- function(filename, baseFolder = repository){
paste0(baseFolder, filename)
}
load_global <- function(filename, baseFolder = repository, checkIfExists = TRUE, varName = NULL){
obj <- sub('.*/', '', filename)
format <- tolower(sub('.*\\.', '', obj))
if(is.null(varName))
varName <- sub('\\..*', '', obj)
if(checkIfExists & exists(x = varName, envir = .GlobalEnv))return("exists in .GlobalEnv")
if(!format %in% c("rds", "fst"))return("unknown format")
if(format == "rds"){
assign(
varName,
readRDS(getSourcePath(filename, baseFolder)),
envir = .GlobalEnv)
}
if(format == "fst"){
assign(
varName,
fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = TRUE),
envir = .GlobalEnv)
}
}
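A minimal sketch with a hypothetical file: the object is assigned in .GlobalEnv under the file's base name unless a variable of that name already exists.
load_global("jobsOutput/example/myData.rds") # assigns `myData`
load_global("jobsOutput/example/myData.rds", varName = "myDataCopy") # explicit variable name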
#loadBinary <- function(filename, baseFolder = repository){
# obj <- sub('.*/', '', filename)
# format <- tolower(sub('.*\\.', '', obj))
# if(!format %in% c("rds", "fst", "csv"))return("unknown format")
# if(format == "rds")return(readRDS(getSourcePath(filename, baseFolder)))
# if(format == "fst")return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = TRUE))
#}
#
#saveBinary <- function(data, filename = filename, baseFolder = repository){
# obj <- sub('.*/', '', filename)
# format <- tolower(sub('.*\\.', '', obj))
# if(!format %in% c("rds", "fst"))return("unknown format")
# if(format == "rds"){saveRDS(data, getSourcePath(filename, baseFolder))}
# if(format == "fst"){fst::write_fst(data, getSourcePath(filename, baseFolder))}
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE, fix.greek = TRUE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} #removing html tags
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)} #removing newlines and tabs
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words with 35 or more letters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between numbers and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters from the beginning of the text
if(fix.greek){
text <- gsub("ς", "σ", text)
text <- gsub("ά", "α", text)
text <- gsub("έ", "ε", text)
text <- gsub("ή", "η", text)
text <- gsub("ί", "ι", text)
text <- gsub("ύ", "υ", text)
text <- gsub("ό", "ο", text)
text <- gsub("ώ", "ω", text)
}
trimws(text)
}
getAesXTextWrapFeatures <- function(text, wrapCharLength = 20, lineCountMax = 5){
textBroken <- lapply(text, strwrap, width = wrapCharLength)
textWrapped <- sapply(textBroken, paste, collapse = "\n")
if(is.factor(text)){
levels <- levels(text)
levelsBroken <- lapply(levels, strwrap, width = wrapCharLength)
levelsWrapped <- sapply(levelsBroken, paste, collapse = "\n")
textWrapped <- factor(textWrapped, levels = levelsWrapped, ordered = TRUE)
}
textLinesCount <- sapply(textBroken, length)
textLinesMaxNchar <- sapply(sapply(textBroken, nchar), max) %>% head(length(textWrapped)) # head fix by OSCEDVS-33
textValuePrependLines <- sapply(textLinesCount, function(x){paste0(rep("\n", x), collapse = "")})
textLinesNcharDiff <- textLinesMaxNchar / wrapCharLength
textLinesCountDiff <- textLinesCount / lineCountMax
data.table(
textWrapped,
textLinesCount,
textLinesMaxNchar,
textValuePrependLines,
textLinesNcharDiff,
textLinesCountDiff
)
}
country_name <- function(vec){
missing <- data.table(Alpha_2 = c("XK"), Name = c("Kosovo"))
ISO_3166_1 <- rbind(ISO_3166_1[, c("Alpha_2", "Name")], missing, fill = TRUE)
vec <- toupper(vec)
vec <- ifelse(vec == "EL", "GR", vec)
vec <- ifelse(vec == "EN", "GB", vec)
vec <- ifelse(vec == "UK", "GB", vec)
ISO_3166_1[match(vec, ISO_3166_1$Alpha_2), ]$Name %>% as.factor
}
language_name <- function(vec){
iso_ext <- data.table(
Alpha_3_B = rep("", 2),
Alpha_3_T = rep("", 2),
Alpha_2 = c("sr-cyr", "sr-lat"),
Name = c("Serbian Cyrillic", "Serbian Latin")
) %>% rbind(ISO_639_2)
vec <- tolower(vec)
res <- iso_ext[match(vec, iso_ext$Alpha_2), ]$Name
gsub('[^[:alnum:]^[:space:]].*', '', res) %>% as.factor
}
demonyms <- fread(getSourcePath("input/misc/Demonyms-List.csv", repository))
setnames(demonyms, c("ISO_3166", "Country", "Demonym_1", "Demonym_2", "Demonym_3"))
nationality_name <- function(vec){
vec <- toupper(vec)
vec <- ifelse(vec == "EL", "GR", vec)
vec <- ifelse(vec == "EN", "GB", vec)
vec <- ifelse(vec == "UK", "GB", vec)
res <- demonyms[match(vec, ISO_3166)]$Demonym_1
res <- gsub('[^[:alnum:]].*', '', res)
ifelse(is.na(vec), NA, res) %>% as.factor
}
gender_name <- function(x){
gender_lookup <- data.table(
name = c("Female", "Male"),
value = c("F", "M")
)
gender_lookup[match(x, gender_lookup$value), name] %>% as.factor
}
year_groups <- function(year, firstYear = 1941, frame = 5){
year <- year %>% as.numeric
firstYear <- firstYear %>% as.numeric
lastYear <- max(year[!is.na(year)]) - ((max(year[!is.na(year)]) - firstYear) %% frame)
groupNum <- ((lastYear - firstYear) / frame) + 1
lowerBounds <- firstYear + frame*(seq(groupNum)-1)
upperBounds <- lowerBounds + frame - 1
yearLevels <- lapply(seq_along(1:groupNum), function(year)paste0(lowerBounds[year],"-",upperBounds[year])) %>% unlist
yearGroup <- rep(NA, length(year))
for(i in seq_along(yearLevels)){
yearGroup <- ifelse(year >= lowerBounds[i] & year <= upperBounds[i], yearLevels[i], yearGroup)
}
yearGroup %>% ordered(yearLevels)
}
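For instance, with the default 5-year frame starting at 1941:
year_groups(c(1972, 1986, NA))
# 1971-1975, 1986-1990, NA (ordered factor)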
employment_name <- function(x){
employment_lookup <- data.table(
name = c("Employed", "Unemployed"),
value = c(TRUE, FALSE)
)
employment_lookup[match(x, employment_lookup$value), name] %>% as.factor
}
broad_category_name <- function(x){
broad_category_lookup <- data.table(
name = c("Foreign Language", "Mother Tongue", "Computer", "Job Related", "Organisational", "Communication", "Certificate", "Driving", "Unknown"),
value = c("ForeignLanguage", "MotherTongue", "Computer", "JobRelated", "Organisational", "Communication", "Certificate", "Driving", NA)
)
broad_category_lookup[match(x, broad_category_lookup$value), name] %>% as.factor
}
studying_name <- function(x){
studying_lookup <- data.table(
name = c("Currently studying", "No ongoing studies"),
value = c(TRUE, FALSE)
)
studying_lookup[match(x, studying_lookup$value), name] %>% as.factor
}
eqf_level_name <- function(x, eqfLevelLevels = c(rep("Level 1-4", 4), "Level 5", "Level 6", "Level 7", "Level 8")){
eqf_level_lookup <- data.table(
name = eqfLevelLevels,
value = c("1", "2", "3", "4", "5", "6", "7", "8")
)
eqf_level_lookup[match(x, eqf_level_lookup$value), name] %>% ordered(unique(eqfLevelLevels))
}
age_groups <- function(age, ageLevels = c("15-24", "25-49", "50-64"), supremum = 65){
splitAge <- as.numeric(c(gsub("[^[:alnum:]].*", "", ageLevels), supremum))
if (splitAge[1] == 0) ageLevels[1] <- paste("Up to", splitAge[2] - 1)
ageGroup <- rep(NA, length(age))
for(i in seq_along(ageLevels)){
ageGroup <- ifelse(age >= splitAge[i] & age < splitAge[i+1], ageLevels[i], ageGroup)
}
ageGroup %>% ordered(ageLevels)
}
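For example, with the default age groups and a supremum of 65:
age_groups(c(19, 33, 57, 70))
# 15-24, 25-49, 50-64, NA (ages outside the defined groups become NA)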
work_experiences <- function(work_years, workYearLevels = c("1-5", "6-10", "11-25", "26-49"), supremum = 50){
splitWorkYears <- as.numeric(c(gsub("[^[:alnum:]].*", "", workYearLevels), supremum))
if (splitWorkYears[1] == 1) workYearLevels[1] <- paste("Up to", splitWorkYears[2] - 1)
if (splitWorkYears[1] != 0) {
splitWorkYears <- c(0, splitWorkYears)
workYearLevels <- c("No experience", paste(workYearLevels, "years"))
} else {
workYearLevels <- paste(workYearLevels, "years")
}
workExpGroup <- rep(NA, length(work_years))
for(i in seq_along(workYearLevels)){
workExpGroup <- ifelse(work_years >= splitWorkYears[i] & work_years < splitWorkYears[i+1], workYearLevels[i], workExpGroup)
}
workExpGroup %>% ordered(workYearLevels)
}
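For example, with the default experience bands:
work_experiences(c(0, 3, 12))
# No experience, Up to 5 years, 11-25 years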
job_positions <- function(num_jobs, max_jobs = 13) {
num_jobs <- as.numeric(num_jobs)
num_jobs[which(num_jobs > max_jobs)] <- max_jobs
job_levels <- sort(num_jobs) %>% unique
job_levels <- ifelse(job_levels < max_jobs, as.character(job_levels), paste0(as.character(job_levels), "+"))
job_levels <- ifelse(job_levels == 1, paste(job_levels, "position"), paste(job_levels, "positions"))
num_jobs <- ifelse(num_jobs < max_jobs, as.character(num_jobs), paste0(as.character(num_jobs), "+"))
num_jobs <- ifelse(num_jobs == 1, paste(num_jobs, "position"), paste(num_jobs, "positions"))
num_jobs %>% ordered(job_levels)
}
get_word_freqs <- function(dt, textCol = "text", groupCol = "group", stopWords = stop_words) {
setnames(dt, c(textCol, groupCol), c("text", "group"))
freqs <- dt %>%
unnest_tokens(word, text) %>%
anti_join(stopWords) %>%
count(group, word) %>%
complete(group, word, fill = list(n = 0)) %>%
group_by(group) %>%
mutate(total = sum(n), percent = n / total) %>%
ungroup()
setnames(dt, c("text", "group"), c(textCol, groupCol))
freqs
}
get_word_models <- function(dt, textCol = "text", groupCol = "group", freqs = NA, min_n = 50) {
setnames(dt, c(textCol, groupCol), c("text", "group"))
if(is.na(freqs))freqs <- get_word_freqs(dt)
models <- freqs %>%
group_by(word) %>%
filter(sum(n) > min_n) %>%
do(tidy(glm(cbind(n, total - n) ~ group, ., family = "binomial"))) %>%
ungroup()
setnames(dt, c("text", "group"), c(textCol, groupCol))
models
}
findNGrams <- function(corpus, min_n, max_n = min_n, stopWords = NA_character_) {
ngrams <- itoken(corpus, tokenizer = word_tokenizer, progressbar = FALSE) %>%
create_vocabulary(stopwords = stopWords, ngram = c(min_n, max_n), sep_ngram = " ") %>%
as.data.table()
ngrams[order(-doc_count)][, .(term, count = doc_count)]
}
getStopwords <- function(locale) {
stopwordsLocale <- c(stopwords_getlanguages(source = "misc"), stopwords_getlanguages(source = "snowball"))
stopWords <- ""
if (locale %in% stopwordsLocale)
stopWords <- locale %>% stopwords
stopWords
}
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {
tokensList <- strsplit(corpus[, text], " ")
names(tokensList) <- corpus[, code]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count)]
if(normalize == "log")tfDT[, tf := log(1 +term_count)]
merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf)]
}
igraph_from_arules <- function(x){
itemNodes <- which(itemFrequency(items(generatingItemsets(x)), type = "absolute") > 0)
assocNodes <- paste("assoc", 1:length(x), sep='')
lhs <- LIST(lhs(x), decode=FALSE)
from_lhs <- unlist(lhs)
to_lhs <- assocNodes[rep(1:length(x), sapply(lhs, length))]
rhs <- LIST(rhs(x), decode=FALSE)
to_rhs <- unlist(rhs)
from_rhs <- assocNodes[rep(1:length(x), sapply(rhs, length))]
type <- c(rep(1, length(itemNodes)), rep(2, length(assocNodes)))
nodeLabels <- c(itemLabels(x)[itemNodes], rep("", length(assocNodes)))
e.list <- cbind(c(from_lhs, from_rhs), c(to_lhs, to_rhs))
v.labels <- data.frame(
name = c(as.character(itemNodes), assocNodes),
label = nodeLabels,
stringsAsFactors = FALSE)
g <- igraph::graph.data.frame(e.list, directed=TRUE, vertices=v.labels)
## add quality measures
for(m in names(quality(x))) {
g <- igraph::set.vertex.attribute(g, m, which(type==2),
quality(x)[[m]])
}
return(g)
}
timeNeeded <- (proc.time()[3] - source.starting.time);
../000.scripts/data/data_wrangle.R completed in 0.03 seconds
Type of data: data.table, data.frame.
Dimensions: 6795279, 10.
Column Names: term, eqf_granted, institution_country, enrollment_year, graduation_year, locale, country, birth_year, gender, responses.
1.get.data.R completed in 3.1 seconds
qualifications_tf[, enrollment_year := enrollment_year %>% as.numeric]
qualifications_tf[, birth_year := birth_year %>% as.numeric]
qualifications_tf[, term := as.character(term)]
qualifications_tf[, num_words := sapply(strsplit(term, " "), length)]
qualifications_tf[, term := gsub("σ$", "ς ", term)]
qualifications_tf[, term := gsub("σ ", "ς ", term)]
qualifications_tf[, total := sum(responses), by = c("term", "locale")]
qualifications_tf <- qualifications_tf[(num_words == 1 & total > 10) | (num_words == 2 & total > 5)]
qualifications_tf[, code := locale]
qualifications_tf[num_words == 1, grams := "word"]
qualifications_tf[num_words == 2, grams := "phrase"]
qualifications_tf[, term := factor(term)]
qualifications_tf[, locale := language_name(locale)]
qualifications_tf[, age_group1 := age_groups(2019 - birth_year)]
qualifications_tf[, gender := gender_name(gender)]
qualifications_tf[, birth_year := ordered(birth_year)]
qualifications_tf[, eqf_granted := eqf_level_name(eqf_granted)]
qualifications_tf[, enrollment_year := ordered(enrollment_year)]
aggregate_cols <- c(
"term", "eqf_granted", "enrollment_year", "age_group1", "gender", "birth_year", "code", "grams"
)
qualifications_tf <- qualifications_tf[, .SD, .SDcols = c(aggregate_cols, "responses")]
qualifications_tf <- qualifications_tf[, .(count = sum(responses)), by = aggregate_cols] %>% setorderv(aggregate_cols, na.last = TRUE)
keep_cols <- c(aggregate_cols[aggregate_cols != "code" & aggregate_cols != "grams"], "count")
2.process.data.R completed in 91.29 seconds
locales <- qualifications_tf[, code] %>% unique
fileNames <- lapply(locales, function(loc){
grams <- qualifications_tf[code == loc, grams] %>% unique
lapply(grams, function(key) {
fileName <- paste0(loc, "_", key, ".fst")
saveBinary(qualifications_tf[code == loc & grams == key, .SD, .SDcols = keep_cols], fileName, outputRepo, format = "fst")
fileName
}) %>% unlist
}) %>% unlist
/data/generic/jobsOutput/shiny_text_mining/qualification_terms/cs_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/cs_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/pt_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/pt_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/de_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/de_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sk_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sk_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/en_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/en_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/it_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/it_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/es_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/es_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/tr_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/tr_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/hu_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/hu_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/ro_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/ro_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/fr_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/fr_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/nl_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/nl_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/lv_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/lv_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/el_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/el_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/bg_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/bg_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/pl_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/pl_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/hr_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/hr_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/fi_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/fi_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sl_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sl_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/lt_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/lt_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sr-lat_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sr-lat_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sv_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sv_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/nb_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/nb_phrase.fst, 
/data/generic/jobsOutput/shiny_text_mining/qualification_terms/et_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/et_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/da_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/da_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sr-cyr_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sr-cyr_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/is_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/mt_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/mt_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/mk_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/mk_phrase.fst
3.save.data.R completed in 1.24 seconds
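For reference, a hedged sketch of how the Shiny application might read one of the per-locale files back, using loadBinary from the base functions and a path taken from the list above.
en_words <- loadBinary("jobsOutput/shiny_text_mining/qualification_terms/en_word.fst", format = "fst")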
Completed in 96.25 seconds.