Abstract
This process creates standardized unigram and bigram frequency datasets for qualifications-related text. A separate binary file is created for each locale to accommodate use with the Shiny application.
# http://rmarkdown.rstudio.com/html_document_format.htm
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]
Information about the libraries, environment, sources used and their execution is reported. Additional information is provided within section tabs. Navigating through the report is also possible through the table of contents. Reported tables can be dynamically filtered, searched, ordered and exported into various formats.
librariesVersion <- c()
for(i in 1:length(libraries))
librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: DT
## Loading required package: stringr
## Loading required package: magrittr
## Loading required package: ISOcodes
../000.core/00.01.libraries.R completed in 0.48 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# ESCO skills
# @authors ds@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
if(length(colClasses) == 0)
return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
}
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
fileName <- getSourcePath(filename, baseFolder)
dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
if(format == "rds") saveRDS(data, fileName)
if(format == "fst") fst::write_fst(data, fileName)
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
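For illustration, a minimal round-trip sketch of these helpers; the example/terms.fst path is hypothetical and assumes the repository folder is writable.
exampleDT <- data.table::data.table(term = c("skill", "course"), count = c(10L, 3L))
saveBinary(exampleDT, "example/terms.fst", format = "fst")
loadBinary("example/terms.fst", format = "fst")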
rowColumns <- function(data){
return(paste( format(nrow(data), big.mark=","), "Rows X ", ncol(data), "Columns"))
}
#as the mount storage is in memory, make sure the asset include.css is there.
publishIncludeCss <- function(){
sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
destinationFile <- "/data/tmpfs/results/include.css"
if (!file.exists(destinationFile)) {
return (file.copy(sourceFile, destinationFile))
}else{
return(TRUE)
}
}
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
for(name in names(data)){
if( class(data[[name]]) =="character"){
data[[name]] <- as.factor(data[[name]])
}
}
return(data)
}
codeBook <- function(dataset){
out <- lapply(names(dataset), function(var_name) {
knitr::knit_expand(text = readLines("../000.core/codeBook.template"))
})
cat(
knitr::knit(
text = unlist(paste(out, collapse = '\n')),
quiet = TRUE)
)
}
fwrite_zip <- function(data, filename, quote = TRUE){
dir.create(dirname(filename), recursive = TRUE, showWarnings = FALSE)
filename_csv <- strsplit(filename, "/") %>% unlist %>% tail(1)
filename_csv <- gsub(".zip", ".csv", filename_csv)
fwrite(data, filename_csv, quote = quote)
if(file.exists(filename))unlink(filename)
zip(filename, filename_csv)
unlink(filename_csv)
}
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName%in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data){
columnsName <- names(data)
columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
#result <- styliseDTNumericalColumn(data,result, "percentMatch", '#5fba7d', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
#read_xml_to_list <- function(filepath, is.gz = FALSE){
# if(is.gz){
# temp_data <- paste0(repository, "data/delete.me")
# result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
# Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
# unlink(temp_data)
# result
# }else{
# xmlToList(xmlParse(filepath))
# }
#}
#transpose_list_to_dt <- function(data_list){
# dt <- t(as.data.table(data_list))
# dt <- as.data.table(dt)
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# names(dt) <- names(data_list[[1]])
# dt
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} #removing html tags
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)} #removing newlines and tabs
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words with 35 or more letters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between numbers and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters from the beginning of the text
trimws(text)
}
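A hedged illustration of the cleansing defaults on a hypothetical HTML fragment; note that digits are dropped because rem.nonalphanum keeps letters only.
cleansingCorpus("<p>Awarded 3 ECTS credits, see https://example.org</p>")
# roughly: "awarded ects credits see"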
cleansingEducationCorpus <- function(text) {
text <- gsub("\\.", "", text) #removing periods
text <- gsub("[[:punct:]]", " ", text) #removing other punctuation
text <- gsub("\\s+", " ", text) #removing excess space
text <- tolower(text) #changing case to lower
#removing accents from Greek vowels and mapping final sigma (ς) to σ
text <- gsub("ς", "σ", text)
text <- gsub("ά", "α", text)
text <- gsub("έ", "ε", text)
text <- gsub("ή", "η", text)
text <- gsub("ί", "ι", text)
text <- gsub("ύ", "υ", text)
text <- gsub("ό", "ο", text)
text <- gsub("ώ", "ω", text)
trimws(text) #trimming white-space
}
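A brief hedged example of the Greek normalisation on a hypothetical string.
cleansingEducationCorpus("Πτυχίο Πληροφορικής!")
# roughly: "πτυχιο πληροφορικησ" (punctuation dropped, accents removed, final sigma mapped to σ)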
#This function removes dates that are "relics" from the xml parsing
removeDates <- function(text){
days <- "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
months <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
date_form1 <- paste(days, months, "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]")
date_form2 <- "\\?[0-9][0-9][0-9][0-9]"
text <- gsub(date_form1, " ", text)
gsub(date_form2, " ", text)
}
xmlToDataTable <- function(xmlData, itemNames){
itemList <- lapply(itemNames,
function(x){
xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
}
)
names(itemList) <- itemNames
as.data.table(itemList)
}
cleanCorpusHtml <- function(text){
unlist(lapply(text, function(x){
if(nchar(x) > 0){
# list items (<li>) often start a node with tag-like keywords; they are relocated to the end
# so the description starts with the main content and no information is lost
html <- gsub(">","> ", x) # add spaces after html tags so these aren't concatenated
xml <- read_xml(html, as_html = TRUE)
lis <- xml_find_all(xml, ".//li")
xml_remove(lis)
text <- paste( paste(xml_text(xml), collapse ="") , paste(xml_text(lis) , collapse =""), collapse ="")
text <- gsub("\\s+"," ", text)
}else {""}
}))
}
#Split equally a vector into chunks of number n_chunks
equal_split <- function(vct, n_chunks) {
lim <- length(vct)
fstep <- lim%/%n_chunks
idx_list <- list()
for(i in seq(n_chunks - 1)){
idx_list[[i]] <- vct[((i-1)*fstep + 1):(i*(fstep))]
}
idx_list[[n_chunks]] <- vct[((n_chunks - 1)*fstep + 1):(lim)]
return(idx_list)
}
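For example, splitting ten indices into three chunks places the remainder in the last chunk.
equal_split(1:10, 3)
# list(1:3, 4:6, 7:10)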
#Function that takes a vector, and returns thresholded first 10 sorted indexes
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
vct <- ifelse(vct > threshold, vct, Inf)
indexVec <- head(order(vct), numHead)
idVec[indexVec]
}
#Function to read xml nodes in description
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
xml_attr(xml_find_all(nodes, paste0(".//", elementType)), attribute)
}
#Function to add results to datatable
elementsToDataTable <- function(result, elementType){
if(length(result) > 0)
data.table(elementType = elementType, attributeValue = result)
else
data.table()
}
#Function to retrieve urls from text
keepHtmlElements <- function(feedItem){
nodes <- read_xml(paste0("<div>", feedItem, "</div>"), as_html = TRUE)
rbind(
elementsToDataTable(maintainElements(nodes, "a", "href"), "link"),
elementsToDataTable(maintainElements(nodes, "img", "src"), "image"),
elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")
#All "img-src" are NA
)
}
#retrieve list of parameters in a http request query
getQueryParams <- function(url){
query <- httr::parse_url(url)$query
queryValues <- unlist(query)
queryNames <- names(query)
dat <- data.table(varName = queryNames, value = queryValues)
dat[queryValues != ""]
}
keepCountryName <- function(string){
string <- gsub(".*_", "", string)
gsub("\\..*", "", string)
}
keepNTokens <- function(string, num){
tokenList <- strsplit(string, split = " ")
sapply(tokenList, function(tokens){
tokens <- sort(tokens)
tokensShift <- shift(tokens, -num, fill = FALSE)
paste(tokens[tokens != tokensShift], collapse = " ")
})
}
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {
tokensList <- strsplit(corpus[, text], " ")
names(tokensList) <- corpus[, code]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count)]
if(normalize == "log")tfDT[, tf := log(1 +term_count)]
merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf)]
}
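A minimal sketch of findTFIDF on a hypothetical two-document corpus; the input needs text and code columns.
corpusExample <- data.table(
code = c("de", "fr"),
text = c("master degree in informatics", "licence professionnelle en informatique")
)
findTFIDF(corpusExample, stopwords = c("in", "en"), min_char = 2)
# one row per (term, class) pair with its smoothed tf-idf score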
tidyJsonData <- function(jsonList){
if(length(jsonList) == 0)return(NULL)
unlistOccupations <- jsonList %>% unlist
codesMaleFemale <- names(unlistOccupations)
epasMapping <- data.table(unlistOccupations)
epasMapping[ , code := gsub("\\.[[:alpha:]]$", "", codesMaleFemale)]
uniqueMappingBoolean <- epasMapping[ , unlistOccupations != c(unlistOccupations[-1], F), by = code]$V1
codesEpasDB <- epasMapping[uniqueMappingBoolean]
names(codesEpasDB) <- c("title", "code")
codesEpasDB[ , title := cleansingCorpus(title)]
}
findNGrams <- function(corpus, min_n, max_n = min_n, stopWords = NA_character_) {
ngrams <- itoken(corpus, tokenizer = word_tokenizer, progressbar = FALSE) %>%
create_vocabulary(stopwords = stopWords, ngram = c(min_n, max_n), sep_ngram = " ") %>%
as.data.table()
ngrams[order(-doc_count)][, .(term, count = doc_count)]
}
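A small sketch, assuming the text2vec package (which provides itoken, word_tokenizer and create_vocabulary) is attached.
findNGrams(c("bachelor of science", "master of science"), min_n = 1, max_n = 2)
# a data.table of unigrams and bigrams ordered by document count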
`%W>%` <- function(lhs,rhs){
w <- options()$warn
on.exit(options(warn=w))
options(warn=-1)
eval.parent(substitute(lhs %>% rhs))
}
getStopwords <- function(locale) {
stopwordsLocale <- c(stopwords_getlanguages(source = "misc"), stopwords_getlanguages(source = "snowball"))
stopWords <- ""
if (locale %in% stopwordsLocale)
stopWords <- locale %W>% stopwords
stopWords
}
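Assuming the stopwords package is attached, the helper returns a locale's stopword list when one exists and an empty string otherwise.
getStopwords("en") # English stopwords from the snowball source
getStopwords("xx") # no list available, returns ""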
###################################################################################################
# Text translation
###################################################################################################
translateText <- function(sourceText, sourceLang, translationLang, batchSize = 4800) {
if (length(sourceText) == 0) {
return ("")
} else if (length(sourceText) == 1) {
return (requestTranslation(sourceText, sourceLang, translationLang))
} else if (length(sourceText) > 4800) {
return (NA)
}
sourceQueries <- gsub("$", "\n >", sourceText)
sourceQueries <- gsub("^", "< \n", sourceQueries)
queries <- data.table(query = sourceQueries, size = nchar(sourceQueries), batch = nchar(sourceQueries))
for (row in seq(nrow(queries) - 1)) {
cumulativeSum <- queries[row, batch] + queries[row + 1, batch] + 3
queries[row + 1, batch := ifelse(cumulativeSum > batchSize, batch, cumulativeSum)]
}
batchStarts <- which(queries[, size] == queries[, batch])
batchFins <- c(batchStarts[-1] - 1, nrow(queries))
batches <- lapply(seq_along(batchStarts), function(i) batchStarts[i]:batchFins[i])
pastedQueries <- lapply(batches, function(batch) paste0(queries[batch, query], collapse = "\n")) %>% unlist()
translatedText <- lapply(pastedQueries, requestTranslation, sourceLang, translationLang) %>%
unlist() %>%
paste0(collapse = " ")
translatedText <- gsub("\\s?<", "", translatedText)
gsub(">$", "", translatedText) %>%
space_tokenizer(sep = ">") %>%
unlist() %>%
trimws()
}
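A hedged usage sketch: this sends a live request to translate.google.com through requestTranslation below, so it depends on network access and on the page structure remaining stable.
translateText("Πτυχίο Πληροφορικής", "el", "en")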
requestTranslation <- function(sourceText, sourceLang, translationLang) {
googleTranslateURL <- paste0(
"https://translate.google.com/m",
"?hl=", sourceLang,
"&sl=", sourceLang,
"&tl=", translationLang,
"&ie=UTF-8&prev=_m&q=", URLencode(sourceText, reserved = TRUE)
)
GET(googleTranslateURL, add_headers("user-agent" = "Mozilla/5.0")) %>%
read_html() %>%
xml_child(2) %>%
xml_child(5) %>%
xml_text() %>%
unlist()
}
###########################################################################################################
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ISOcodes_2021.02.24 magrittr_2.0.1 stringr_1.4.0
## [4] DT_0.18 dplyr_1.0.6 rmarkdown_2.8
## [7] data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] knitr_1.33 tidyselect_1.1.1 R6_2.5.0 rlang_0.4.11
## [5] fansi_0.4.2 highr_0.9 tools_4.0.5 xfun_0.23
## [9] utf8_1.2.1 jquerylib_0.1.4 htmltools_0.5.1.1 ellipsis_0.3.2
## [13] yaml_2.2.1 digest_0.6.27 tibble_3.1.1 lifecycle_1.0.0
## [17] crayon_1.4.1 purrr_0.3.4 htmlwidgets_1.5.3 sass_0.4.0
## [21] vctrs_0.3.8 glue_1.4.2 evaluate_0.14 stringi_1.6.2
## [25] compiler_4.0.5 bslib_0.2.5 pillar_1.6.0 generics_0.1.0
## [29] jsonlite_1.7.2 pkgconfig_2.0.3
../000.core/00.02.base.functions.R completed in 0.11 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
euCountries <- c("BE", "BG", "CZ", "DK", "DE", "EE", "IE", "EL", "ES", "FR",
"HR", "IT", "CY", "LV", "LT", "LU", "HU", "MT", "NL", "AT",
"PL", "PT", "RO", "SI", "SK", "FI", "SE", "UK")
neuCountries <- c("ME", "MK", "AL", "RS", "TR")
getSourcePath <- function(filename, baseFolder = repository){
paste0(baseFolder, filename)
}
load_global <- function(filename, baseFolder = repository, checkIfExists = TRUE, varName = NULL){
obj <- sub('.*/', '', filename)
format <- tolower(sub('.*\\.', '', obj))
if(is.null(varName))
varName <- sub('\\..*', '', obj)
if(checkIfExists & exists(x = varName, envir = .GlobalEnv))return("exists in .GlobalEnv")
if(!format %in% c("rds", "fst"))return("unknown format")
if(format == "rds"){
assign(
varName,
readRDS(getSourcePath(filename, baseFolder)),
envir = .GlobalEnv)
}
if(format == "fst"){
assign(
varName,
fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = TRUE),
envir = .GlobalEnv)
}
}
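A minimal sketch with a hypothetical file: the object is assigned in .GlobalEnv under the file's base name unless a variable of that name already exists.
load_global("jobsOutput/example/myData.rds") # assigns `myData`
load_global("jobsOutput/example/myData.rds", varName = "myDataCopy") # explicit variable name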
#loadBinary <- function(filename, baseFolder = repository){
# obj <- sub('.*/', '', filename)
# format <- tolower(sub('.*\\.', '', obj))
# if(!format %in% c("rds", "fst", "csv"))return("unknown format")
# if(format == "rds")return(readRDS(getSourcePath(filename, baseFolder)))
# if(format == "fst")return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = TRUE))
#}
#
#saveBinary <- function(data, filename = filename, baseFolder = repository){
# obj <- sub('.*/', '', filename)
# format <- tolower(sub('.*\\.', '', obj))
# if(!format %in% c("rds", "fst"))return("unknown format")
# if(format == "rds"){saveRDS(data, getSourcePath(filename, baseFolder))}
# if(format == "fst"){fst::write_fst(data, getSourcePath(filename, baseFolder))}
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE, fix.greek = TRUE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} #removing html tags
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)} #removing newlines and tabs
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words with 35 or more letters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between numbers and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters from the beginning of the text
if(fix.greek){
text <- gsub("ς", "σ", text)
text <- gsub("ά", "α", text)
text <- gsub("έ", "ε", text)
text <- gsub("ή", "η", text)
text <- gsub("ί", "ι", text)
text <- gsub("ύ", "υ", text)
text <- gsub("ό", "ο", text)
text <- gsub("ώ", "ω", text)
}
trimws(text)
}
getAesXTextWrapFeatures <- function(text, wrapCharLength = 20, lineCountMax = 5){
textBroken <- lapply(text, strwrap, width = wrapCharLength)
textWrapped <- sapply(textBroken, paste, collapse = "\n")
if(is.factor(text)){
levels <- levels(text)
levelsBroken <- lapply(levels, strwrap, width = wrapCharLength)
levelsWrapped <- sapply(levelsBroken, paste, collapse = "\n")
textWrapped <- factor(textWrapped, levels = levelsWrapped, ordered = TRUE)
}
textLinesCount <- sapply(textBroken, length)
textLinesMaxNchar <- sapply(sapply(textBroken, nchar), max) %>% head(length(textWrapped)) # head fix by OSCEDVS-33
textValuePrependLines <- sapply(textLinesCount, function(x){paste0(rep("\n", x), collapse = "")})
textLinesNcharDiff <- textLinesMaxNchar / wrapCharLength
textLinesCountDiff <- textLinesCount / lineCountMax
data.table(
textWrapped,
textLinesCount,
textLinesMaxNchar,
textValuePrependLines,
textLinesNcharDiff,
textLinesCountDiff
)
}
country_name <- function(vec){
missing <- data.table(Alpha_2 = c("XK"), Name = c("Kosovo"))
ISO_3166_1 <- rbind(ISO_3166_1[, c("Alpha_2", "Name")], missing, fill = TRUE)
vec <- toupper(vec)
vec <- ifelse(vec == "EL", "GR", vec)
vec <- ifelse(vec == "EN", "GB", vec)
vec <- ifelse(vec == "UK", "GB", vec)
ISO_3166_1[match(vec, ISO_3166_1$Alpha_2), ]$Name %>% as.factor
}
language_name <- function(vec){
iso_ext <- data.table(
Alpha_3_B = rep("", 2),
Alpha_3_T = rep("", 2),
Alpha_2 = c("sr-cyr", "sr-lat"),
Name = c("Serbian Cyrillic", "Serbian Latin")
) %>% rbind(ISO_639_2)
vec <- tolower(vec)
res <- iso_ext[match(vec, iso_ext$Alpha_2), ]$Name
gsub('[^[:alnum:]^[:space:]].*', '', res) %>% as.factor
}
demonyms <- fread(getSourcePath("input/misc/Demonyms-List.csv", repository))
setnames(demonyms, c("ISO_3166", "Country", "Demonym_1", "Demonym_2", "Demonym_3"))
nationality_name <- function(vec){
vec <- toupper(vec)
vec <- ifelse(vec == "EL", "GR", vec)
vec <- ifelse(vec == "EN", "GB", vec)
vec <- ifelse(vec == "UK", "GB", vec)
res <- demonyms[match(vec, ISO_3166)]$Demonym_1
res <- gsub('[^[:alnum:]].*', '', res)
ifelse(is.na(vec), NA, res) %>% as.factor
}
gender_name <- function(x){
gender_lookup <- data.table(
name = c("Female", "Male"),
value = c("F", "M")
)
gender_lookup[match(x, gender_lookup$value), name] %>% as.factor
}
year_groups <- function(year, firstYear = 1941, frame = 5){
year <- year %>% as.numeric
firstYear <- firstYear %>% as.numeric
lastYear <- max(year[!is.na(year)]) - ((max(year[!is.na(year)]) - firstYear) %% frame)
groupNum <- ((lastYear - firstYear) / frame) + 1
lowerBounds <- firstYear + frame*(seq(groupNum)-1)
upperBounds <- lowerBounds + frame - 1
yearLevels <- lapply(seq_along(1:groupNum), function(year)paste0(lowerBounds[year],"-",upperBounds[year])) %>% unlist
yearGroup <- rep(NA, length(year))
for(i in seq_along(yearLevels)){
yearGroup <- ifelse(year >= lowerBounds[i] & year <= upperBounds[i], yearLevels[i], yearGroup)
}
yearGroup %>% ordered(yearLevels)
}
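For instance, with the default 5-year frame starting at 1941:
year_groups(c(1972, 1986, NA))
# 1971-1975, 1986-1990, NA (ordered factor)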
employment_name <- function(x){
employment_lookup <- data.table(
name = c("Employed", "Unemployed"),
value = c(TRUE, FALSE)
)
employment_lookup[match(x, employment_lookup$value), name] %>% as.factor
}
broad_category_name <- function(x){
broad_category_lookup <- data.table(
name = c("Foreign Language", "Mother Tongue", "Computer", "Job Related", "Organisational", "Communication", "Certificate", "Driving", "Unknown"),
value = c("ForeignLanguage", "MotherTongue", "Computer", "JobRelated", "Organisational", "Communication", "Certificate", "Driving", NA)
)
broad_category_lookup[match(x, broad_category_lookup$value), name] %>% as.factor
}
studying_name <- function(x){
studying_lookup <- data.table(
name = c("Currently studying", "No ongoing studies"),
value = c(TRUE, FALSE)
)
studying_lookup[match(x, studying_lookup$value), name] %>% as.factor
}
eqf_level_name <- function(x, eqfLevelLevels = c(rep("Level 1-4", 4), "Level 5", "Level 6", "Level 7", "Level 8")){
eqf_level_lookup <- data.table(
name = eqfLevelLevels,
value = c("1", "2", "3", "4", "5", "6", "7", "8")
)
eqf_level_lookup[match(x, eqf_level_lookup$value), name] %>% ordered(unique(eqfLevelLevels))
}
age_groups <- function(age, ageLevels = c("15-24", "25-49", "50-64"), supremum = 65){
splitAge <- as.numeric(c(gsub("[^[:alnum:]].*", "", ageLevels), supremum))
if (splitAge[1] == 0) ageLevels[1] <- paste("Up to", splitAge[2] - 1)
ageGroup <- rep(NA, length(age))
for(i in seq_along(ageLevels)){
ageGroup <- ifelse(age >= splitAge[i] & age < splitAge[i+1], ageLevels[i], ageGroup)
}
ageGroup %>% ordered(ageLevels)
}
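For example, with the default age groups and a supremum of 65:
age_groups(c(19, 33, 57, 70))
# 15-24, 25-49, 50-64, NA (ages outside the defined groups become NA)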
work_experiences <- function(work_years, workYearLevels = c("1-5", "6-10", "11-25", "26-49"), supremum = 50){
splitWorkYears <- as.numeric(c(gsub("[^[:alnum:]].*", "", workYearLevels), supremum))
if (splitWorkYears[1] == 1) workYearLevels[1] <- paste("Up to", splitWorkYears[2] - 1)
if (splitWorkYears[1] != 0) {
splitWorkYears <- c(0, splitWorkYears)
workYearLevels <- c("No experience", paste(workYearLevels, "years"))
} else {
workYearLevels <- paste(workYearLevels, "years")
}
workExpGroup <- rep(NA, length(work_years))
for(i in seq_along(workYearLevels)){
workExpGroup <- ifelse(work_years >= splitWorkYears[i] & work_years < splitWorkYears[i+1], workYearLevels[i], workExpGroup)
}
workExpGroup %>% ordered(workYearLevels)
}
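For example, with the default experience bands:
work_experiences(c(0, 3, 12))
# No experience, Up to 5 years, 11-25 years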
job_positions <- function(num_jobs, max_jobs = 13) {
num_jobs <- as.numeric(num_jobs)
num_jobs[which(num_jobs > max_jobs)] <- max_jobs
job_levels <- sort(num_jobs) %>% unique
job_levels <- ifelse(job_levels < max_jobs, as.character(job_levels), paste0(as.character(job_levels), "+"))
job_levels <- ifelse(job_levels == 1, paste(job_levels, "position"), paste(job_levels, "positions"))
num_jobs <- ifelse(num_jobs < max_jobs, as.character(num_jobs), paste0(as.character(num_jobs), "+"))
num_jobs <- ifelse(num_jobs == 1, paste(num_jobs, "position"), paste(num_jobs, "positions"))
num_jobs %>% ordered(job_levels)
}
get_word_freqs <- function(dt, textCol = "text", groupCol = "group", stopWords = stop_words) {
setnames(dt, c(textCol, groupCol), c("text", "group"))
freqs <- dt %>%
unnest_tokens(word, text) %>%
anti_join(stopWords) %>%
count(group, word) %>%
complete(group, word, fill = list(n = 0)) %>%
group_by(group) %>%
mutate(total = sum(n), percent = n / total) %>%
ungroup()
setnames(dt, c("text", "group"), c(textCol, groupCol))
freqs
}
get_word_models <- function(dt, textCol = "text", groupCol = "group", freqs = NA, min_n = 50) {
setnames(dt, c(textCol, groupCol), c("text", "group"))
if(is.na(freqs))freqs <- get_word_freqs(dt)
models <- freqs %>%
group_by(word) %>%
filter(sum(n) > min_n) %>%
do(tidy(glm(cbind(n, total - n) ~ group, ., family = "binomial"))) %>%
ungroup()
setnames(dt, c("text", "group"), c(textCol, groupCol))
models
}
findNGrams <- function(corpus, min_n, max_n = min_n, stopWords = NA_character_) {
ngrams <- itoken(corpus, tokenizer = word_tokenizer, progressbar = FALSE) %>%
create_vocabulary(stopwords = stopWords, ngram = c(min_n, max_n), sep_ngram = " ") %>%
as.data.table()
ngrams[order(-doc_count)][, .(term, count = doc_count)]
}
getStopwords <- function(locale) {
stopwordsLocale <- c(stopwords_getlanguages(source = "misc"), stopwords_getlanguages(source = "snowball"))
stopWords <- ""
if (locale %in% stopwordsLocale)
stopWords <- locale %>% stopwords
stopWords
}
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {
tokensList <- strsplit(corpus[, text], " ")
names(tokensList) <- corpus[, code]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count)]
if(normalize == "log")tfDT[, tf := log(1 +term_count)]
merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf)]
}
igraph_from_arules <- function(x){
itemNodes <- which(itemFrequency(items(generatingItemsets(x)), type = "absolute") > 0)
assocNodes <- paste("assoc", 1:length(x), sep='')
lhs <- LIST(lhs(x), decode=FALSE)
from_lhs <- unlist(lhs)
to_lhs <- assocNodes[rep(1:length(x), sapply(lhs, length))]
rhs <- LIST(rhs(x), decode=FALSE)
to_rhs <- unlist(rhs)
from_rhs <- assocNodes[rep(1:length(x), sapply(rhs, length))]
type <- c(rep(1, length(itemNodes)), rep(2, length(assocNodes)))
nodeLabels <- c(itemLabels(x)[itemNodes], rep("", length(assocNodes)))
e.list <- cbind(c(from_lhs, from_rhs), c(to_lhs, to_rhs))
v.labels <- data.frame(
name = c(as.character(itemNodes), assocNodes),
label = nodeLabels,
stringsAsFactors = FALSE)
g <- igraph::graph.data.frame(e.list, directed=TRUE, vertices=v.labels)
## add quality measures
for(m in names(quality(x))) {
g <- igraph::set.vertex.attribute(g, m, which(type==2),
quality(x)[[m]])
}
return(g)
}
timeNeeded <- (proc.time()[3] - source.starting.time);
../000.scripts/data/data_wrangle.R completed in 0.03 seconds
Type of data: data.table, data.frame.
Dimensions: 6795279, 10.
Column Names: term, eqf_granted, institution_country, enrollment_year, graduation_year, locale, country, birth_year, gender, responses.
1.get.data.R completed in 3.1 seconds
qualifications_tf[, enrollment_year := enrollment_year %>% as.numeric]
qualifications_tf[, birth_year := birth_year %>% as.numeric]
qualifications_tf[, term := as.character(term)]
qualifications_tf[, num_words := sapply(strsplit(term, " "), length)]
qualifications_tf[, term := gsub("σ$", "ς ", term)]
qualifications_tf[, term := gsub("σ ", "ς ", term)]
qualifications_tf[, total := sum(responses), by = c("term", "locale")]
qualifications_tf <- qualifications_tf[(num_words == 1 & total > 10) | (num_words == 2 & total > 5)]
qualifications_tf[, code := locale]
qualifications_tf[num_words == 1, grams := "word"]
qualifications_tf[num_words == 2, grams := "phrase"]
qualifications_tf[, term := factor(term)]
qualifications_tf[, locale := language_name(locale)]
qualifications_tf[, age_group1 := age_groups(2019 - birth_year)]
qualifications_tf[, gender := gender_name(gender)]
qualifications_tf[, birth_year := ordered(birth_year)]
qualifications_tf[, eqf_granted := eqf_level_name(eqf_granted)]
qualifications_tf[, enrollment_year := ordered(enrollment_year)]
aggregate_cols <- c(
"term", "eqf_granted", "enrollment_year", "age_group1", "gender", "birth_year", "code", "grams"
)
qualifications_tf <- qualifications_tf[, .SD, .SDcols = c(aggregate_cols, "responses")]
qualifications_tf <- qualifications_tf[, .(count = sum(responses)), by = aggregate_cols] %>% setorderv(aggregate_cols, na.last = TRUE)
keep_cols <- c(aggregate_cols[aggregate_cols != "code" & aggregate_cols != "grams"], "count")
2.process.data.R completed in 91.29 seconds
locales <- qualifications_tf[, code] %>% unique
fileNames <- lapply(locales, function(loc){
grams <- qualifications_tf[code == loc, grams] %>% unique
lapply(grams, function(key) {
fileName <- paste0(loc, "_", key, ".fst")
saveBinary(qualifications_tf[code == loc & grams == key, .SD, .SDcols = keep_cols], fileName, outputRepo, format = "fst")
fileName
}) %>% unlist
}) %>% unlist
/data/generic/jobsOutput/shiny_text_mining/qualification_terms/cs_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/cs_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/pt_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/pt_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/de_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/de_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sk_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sk_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/en_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/en_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/it_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/it_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/es_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/es_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/tr_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/tr_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/hu_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/hu_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/ro_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/ro_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/fr_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/fr_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/nl_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/nl_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/lv_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/lv_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/el_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/el_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/bg_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/bg_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/pl_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/pl_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/hr_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/hr_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/fi_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/fi_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sl_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sl_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/lt_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/lt_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sr-lat_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sr-lat_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sv_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sv_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/nb_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/nb_phrase.fst, 
/data/generic/jobsOutput/shiny_text_mining/qualification_terms/et_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/et_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/da_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/da_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sr-cyr_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/sr-cyr_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/is_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/mt_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/mt_phrase.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/mk_word.fst, /data/generic/jobsOutput/shiny_text_mining/qualification_terms/mk_phrase.fst
3.save.data.R completed in 1.24 seconds
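For reference, a hedged sketch of how the Shiny application might read one of the per-locale files back, using loadBinary from the base functions and a path taken from the list above.
en_words <- loadBinary("jobsOutput/shiny_text_mining/qualification_terms/en_word.fst", format = "fst")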
Completed in 96.25 seconds.