Abstract
This process creates an indicator dataset for education fields.
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]
Information about the libraries, environment, sources used, and their execution is reported. Additional information is provided within section tabs. Navigating through the report is also possible through the table of contents. Reported tables can be dynamically filtered, searched, ordered, and exported into various formats.
librariesVersion <- c()
for(i in 1:length(libraries))
librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: DT
## Loading required package: stringr
## Loading required package: magrittr
## Loading required package: ISOcodes
../000.core/00.01.libraries.R completed in 0.47 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# ESCO skills
# @authors ds@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
if(length(colClasses) == 0)
return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
}
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
fileName <- getSourcePath(filename, baseFolder)
dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
if(format == "rds") saveRDS(data, fileName)
if(format == "fst") fst::write_fst(data, fileName)
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
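A minimal round-trip sketch of the two helpers above (the example table and the examples/ sub-folder are invented for illustration; rds keeps files small, fst loads faster):
exampleDT <- data.table::data.table(id = 1:3, label = c("a", "b", "c")) # invented data
saveBinary(exampleDT, "examples/example.fst", format = "fst")
exampleBack <- loadBinary("examples/example.fst", format = "fst")
all.equal(exampleDT, exampleBack) # the two tables should match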
rowColumns <- function(data){
return(paste(format(nrow(data), big.mark=","), "Rows X", ncol(data), "Columns"))
}
publishIncludeCss <- function(){
sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
destinationFile <- "/data/tmpfs/results/include.css"
if (!file.exists(destinationFile)) {
return (file.copy(sourceFile, destinationFile))
}else{
return(TRUE);
}
}
#As the mounted storage is in memory, make sure the asset include.css is there.
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
for(name in names(data)){
if( class(data[[name]]) =="character"){
data[[name]] <- as.factor(data[[name]])
}
}
return(data)
}
codeBook <- function(dataset){
out <- lapply(names(dataset), function(var_name) {
knitr::knit_expand(text = readLines("../000.core/codeBook.template"))
})
cat(
knitr::knit(
text = unlist(paste(out, collapse = '\n')),
quiet = TRUE)
)
}
fwrite_zip <- function(data, filename, quote = TRUE){
dir.create(dirname(filename), recursive = TRUE, showWarnings = FALSE)
filename_csv <- strsplit(filename, "/") %>% unlist %>% tail(1)
filename_csv <- gsub(".zip", ".csv", filename_csv)
fwrite(data, filename_csv, quote = quote)
if(file.exists(filename))unlink(filename)
zip(filename, filename_csv)
unlink(filename_csv)
}
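For completeness, a hedged call exporting a small table as a zipped CSV; the table and the target path are assumptions for illustration only (the helper writes the csv to the working directory, zips it into the target, then removes the csv):
fwrite_zip(data.table::data.table(x = 1:3, y = c("a", "b", "c")),
"/data/tmpfs/examples/example.zip")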
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName %in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data){
columnsName <- names(data)
columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
#result <- styliseDTNumericalColumn(data,result, "percentMatch", '#5fba7d', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
#read_xml_to_list <- function(filepath, is.gz = FALSE){
# if(is.gz){
# temp_data <- paste0(repository, "data/delete.me")
# result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
# Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
# unlink(temp_data)
# result
# }else{
# xmlToList(xmlParse(filepath))
# }
#}
#transpose_list_to_dt <- function(data_list){
# dt <- t(as.data.table(data_list))
# dt <- as.data.table(dt)
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# names(dt) <- names(data_list[[1]])
# dt
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} # removing html tags
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)}
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters (digits included)
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words of 35 or more characters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between number and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters identified in the beginning of text
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
trimws(text)
}
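To illustrate the defaults, an invented snippet; HTML tags, the URL and all digits are stripped, case is lowered and whitespace collapsed:
cleansingCorpus("<p>Apply by 01May2020 via https://example.org/jobs</p>")
# roughly: "apply by may via jobs"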
cleansingEducationCorpus <- function(text) {
text <- gsub("\\.", "", text) #removing periods
text <- gsub("[[:punct:]]", " ", text) #removing other punctuation
text <- gsub("\\s+", " ", text) #removing excess space
text <- tolower(text) #changing case to lower
#removing accents from Greek and normalising the final sigma
text <- gsub("ς", "σ", text)
text <- gsub("ά", "α", text)
text <- gsub("έ", "ε", text)
text <- gsub("ή", "η", text)
text <- gsub("ί", "ι", text)
text <- gsub("ύ", "υ", text)
text <- gsub("ό", "ο", text)
text <- gsub("ώ", "ω", text)
trimws(text) #trimming white-space
}
#This function removes dates that are "relics" from the xml parsing
removeDates <- function(text){
days <- "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
months <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
date_form1 <- paste(days, months, "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]")
date_form2 <- "\\?[0-9][0-9][0-9][0-9]"
text <- gsub(date_form1, " ", text)
gsub(date_form2, " ", text)
}
xmlToDataTable <- function(xmlData, itemNames){
itemList <- lapply(itemNames,
function(x){
xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
}
)
names(itemList) <- itemNames
as.data.table(itemList)
}
cleanCorpusHtml <- function(text){
unlist(lapply(text, function(x){
if(nchar(x) > 0){
# some <li> nodes start with tag keywords, so their text is relocated to the end; the information is kept and the description starts with the main content
html <- gsub(">","> ", x) # add spaces after html tags so these aren't concatenated
xml <- read_xml(html, as_html = TRUE)
lis <- xml_find_all(xml, ".//li")
xml_remove(lis)
text <- paste( paste(xml_text(xml), collapse ="") , paste(xml_text(lis) , collapse =""), collapse ="")
text <- gsub("\\s+"," ", text)
}else {""}
}))
}
#Split a vector equally into n_chunks chunks
equal_split <- function(vct, n_chunks) {
lim <- length(vct)
fstep <- lim%/%n_chunks
idx_list <- list()
for(i in seq(n_chunks - 1)){
idx_list[[i]] <- vct[((i-1)*fstep + 1):(i*(fstep))]
}
idx_list[[n_chunks]] <- vct[((n_chunks - 1)*fstep + 1):(lim)]
return(idx_list)
}
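For instance, splitting ten indices into three chunks gives the remainder to the last chunk (illustrative values):
equal_split(1:10, 3)
# [[1]] 1 2 3   [[2]] 4 5 6   [[3]] 7 8 9 10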
#Takes a vector and returns the ids of the (at most numHead) smallest values above the threshold
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
vct <- ifelse(vct > threshold, vct, Inf)
indexVec <- head(order(vct), numHead)
idVec[indexVec]
}
#Function to read xml nodes in description
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
xml_attr(xml_find_all(nodes, paste0(".//", elementType)), attribute)
}
#Function to add results to datatable
elementsToDataTable <- function(result, elementType){
if(length(result) > 0)
data.table(elementType = elementType, attributeValue = result)
else
data.table()
}
#Function to retrieve urls from text
keepHtmlElements <- function(feedItem){
nodes <- read_xml(paste0("<div>", feedItem, "</div>"), as_html = TRUE)
rbind(
elementsToDataTable(maintainElements(nodes, "a", "href"), "link"),
elementsToDataTable(maintainElements(nodes, "img", "src"), "image"),
elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")
#All "img-src" are NA
)
}
#retrieve list of parameters in a http request query
getQueryParams <- function(url){
query <- httr::parse_url(url)$query
queryValues <- unlist(query)
queryNames <- names(query)
dat <- data.table(varName = queryNames, value = queryValues)
dat[queryValues != ""]
}
keepCountryName <- function(string){
string <- gsub(".*_", "", string)
gsub("\\..*", "", string)
}
keepNTokens <- function(string, num){
tokenList <- strsplit(string, split = " ")
sapply(tokenList, function(tokens){
tokens <- sort(tokens)
tokensShift <- shift(tokens, -num, fill = FALSE)
paste(tokens[tokens != tokensShift], collapse = " ")
})
}
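An invented call shows the behaviour: tokens are sorted and each token is kept at most num times, so word order is not preserved.
keepNTokens("alpha alpha alpha beta", 2)
# "alpha alpha beta"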
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {
tokensList <- strsplit(corpus[, text], " ")
names(tokensList) <- corpus[, code]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count)]
if(normalize == "log")tfDT[, tf := log(1 +term_count)]
merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf)]
}
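A toy corpus clarifies the expected input: a data.table with a text column and a code column identifying the document class (column names taken from the body above; values are invented):
exampleCorpus <- data.table::data.table(
code = c("doc1", "doc2"),
text = c("skills in data analysis", "communication skills")
)
findTFIDF(exampleCorpus, stopwords = c("in"))
# one row per (term, class) pair with its smoothed tf-idf weight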
tidyJsonData <- function(jsonList){
if(length(jsonList) == 0)return(NULL)
unlistOccupations <- jsonList %>% unlist
codesMaleFemale <- names(unlistOccupations)
epasMapping <- data.table(unlistOccupations)
epasMapping[ , code := gsub("\\.[[:alpha:]]$", "", codesMaleFemale)]
uniqueMappingBoolean <- epasMapping[ , unlistOccupations != c(unlistOccupations[-1], F), by = code]$V1
codesEpasDB <- epasMapping[uniqueMappingBoolean]
names(codesEpasDB) <- c("title", "code")
codesEpasDB[ , title := cleansingCorpus(title)]
}
findNGrams <- function(corpus, min_n, max_n = min_n, stopWords = NA_character_) {
ngrams <- itoken(corpus, tokenizer = word_tokenizer, progressbar = FALSE) %>%
create_vocabulary(stopwords = stopWords, c(min_n, max_n), sep_ngram = " ") %>%
as.data.table()
ngrams[order(-doc_count)][, .(term, count = doc_count)]
}
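A hedged sketch, assuming the text2vec package (which provides itoken, word_tokenizer and create_vocabulary) is attached; it is not among the packages listed in the session info below.
findNGrams(c("data analysis skills", "data analysis and reporting"), min_n = 2)
# bigrams such as "data analysis", ranked by the number of documents containing them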
`%W>%` <- function(lhs,rhs){
w <- options()$warn
on.exit(options(warn=w))
options(warn=-1)
eval.parent(substitute(lhs %>% rhs))
}
getStopwords <- function(locale) {
stopwordsLocale <- c(stopwords_getlanguages(source = "misc"), stopwords_getlanguages(source = "snowball"))
stopWords <- ""
if (locale %in% stopwordsLocale)
stopWords <- locale %W>% stopwords
stopWords
}
###################################################################################################
# Text translation
###################################################################################################
translateText <- function(sourceText, sourceLang, translationLang, batchSize = 4800) {
if (length(sourceText) == 0) {
return ("")
} else if (length(sourceText) == 1) {
return (requestTranslation(sourceText, sourceLang, translationLang))
} else if (length(sourceText) > 4800) {
return (NA)
}
sourceQueries <- gsub("$", "\n >", sourceText)
sourceQueries <- gsub("^", "< \n", sourceQueries)
queries <- data.table(query = sourceQueries, size = nchar(sourceQueries), batch = nchar(sourceQueries))
for (row in seq(nrow(queries) - 1)) {
cumulativeSum <- queries[row, batch] + queries[row + 1, batch] + 3
queries[row + 1, batch := ifelse(cumulativeSum > batchSize, batch, cumulativeSum)]
}
batchStarts <- which(queries[, size] == queries[, batch])
batchFins <- c(batchStarts[-1] - 1, nrow(queries))
batches <- lapply(seq_along(batchStarts), function(i) batchStarts[i]:batchFins[i])
pastedQueries <- lapply(batches, function(batch) paste0(queries[batch, query], collapse = "\n")) %>% unlist()
translatedText <- lapply(pastedQueries, requestTranslation, sourceLang, translationLang) %>%
unlist() %>%
paste0(collapse = " ")
translatedText <- gsub("\\s?<", "", translatedText)
gsub(">$", "", translatedText) %>%
space_tokenizer(sep = ">") %>%
unlist() %>%
trimws()
}
requestTranslation <- function(sourceText, sourceLang, translationLang) {
googleTranslateURL <- paste0(
"https://translate.google.com/m",
"?hl=", sourceLang,
"&sl=", sourceLang,
"&tl=", translationLang,
"&ie=UTF-8&prev=_m&q=", URLencode(sourceText, reserved = TRUE)
)
GET(googleTranslateURL, add_headers("user-agent" = "Mozilla/5.0")) %>%
read_html() %>%
xml_child(2) %>%
xml_child(5) %>%
xml_text() %>%
unlist()
}
###########################################################################################################
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ISOcodes_2021.02.24 magrittr_2.0.1 stringr_1.4.0
## [4] DT_0.18 dplyr_1.0.6 rmarkdown_2.8
## [7] data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] knitr_1.33 tidyselect_1.1.1 R6_2.5.0 rlang_0.4.11
## [5] fansi_0.4.2 highr_0.9 tools_4.0.5 xfun_0.23
## [9] utf8_1.2.1 jquerylib_0.1.4 htmltools_0.5.1.1 ellipsis_0.3.2
## [13] yaml_2.2.1 digest_0.6.27 tibble_3.1.1 lifecycle_1.0.0
## [17] crayon_1.4.1 purrr_0.3.4 htmlwidgets_1.5.3 sass_0.4.0
## [21] vctrs_0.3.8 glue_1.4.2 evaluate_0.14 stringi_1.6.2
## [25] compiler_4.0.5 bslib_0.2.5 pillar_1.6.0 generics_0.1.0
## [29] jsonlite_1.7.2 pkgconfig_2.0.3
../000.core/00.02.base.functions.R completed in 0.1 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
euCountries <- c("BE", "BG", "CZ", "DK", "DE", "EE", "IE", "EL", "ES", "FR",
"HR", "IT", "CY", "LV", "LT", "LU", "HU", "MT", "NL", "AT",
"PL", "PT", "RO", "SI", "SK", "FI", "SE", "UK")
neuCountries <- c("ME", "MK", "AL", "RS", "TR")
getSourcePath <- function(filename, baseFolder = repository){
paste0(baseFolder, filename)
}
load_global <- function(filename, baseFolder = repository, checkIfExists = TRUE, varName = NULL){
obj <- sub('.*/', '', filename)
format <- tolower(sub('.*\\.', '', obj))
if(is.null(varName))
varName <- sub('\\..*', '', obj)
if(checkIfExists & exists(x = varName, envir = .GlobalEnv))return("exists in .GlobalEnv")
if(!format %in% c("rds", "fst"))return("unknown format")
if(format == "rds"){
assign(
varName,
readRDS(getSourcePath(filename, baseFolder)),
envir = .GlobalEnv)
}
if(format == "fst"){
assign(
varName,
fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = TRUE),
envir = .GlobalEnv)
}
}
#loadBinary <- function(filename, baseFolder = repository){
# obj <- sub('.*/', '', filename)
# format <- tolower(sub('.*\\.', '', obj))
# if(!format %in% c("rds", "fst", "csv"))return("unknown format")
# if(format == "rds")return(readRDS(getSourcePath(filename, baseFolder)))
# if(format == "fst")return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = TRUE))
#}
#
#saveBinary <- function(data, filename = filename, baseFolder = repository){
# obj <- sub('.*/', '', filename)
# format <- tolower(sub('.*\\.', '', obj))
# if(!format %in% c("rds", "fst"))return("unknown format")
# if(format == "rds"){saveRDS(data, getSourcePath(filename, baseFolder))}
# if(format == "fst"){fst::write_fst(data, getSourcePath(filename, baseFolder))}
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE, fix.greek = TRUE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} # removing html tags
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)}
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters (digits included)
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words of 35 or more characters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between number and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters identified in the beginning of text
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(fix.greek){
text <- gsub("ς", "σ", text)
text <- gsub("ά", "α", text)
text <- gsub("έ", "ε", text)
text <- gsub("ή", "η", text)
text <- gsub("ί", "ι", text)
text <- gsub("ύ", "υ", text)
text <- gsub("ό", "ο", text)
text <- gsub("ώ", "ω", text)
}
trimws(text)
}
getAesXTextWrapFeatures <- function(text, wrapCharLength = 20, lineCountMax = 5){
textBroken <- lapply(text, strwrap, width = wrapCharLength)
textWrapped <- sapply(textBroken, paste, collapse = "\n")
if(is.factor(text)){
levels <- levels(text)
levelsBroken <- lapply(levels, strwrap, width = wrapCharLength)
levelsWrapped <- sapply(levelsBroken, paste, collapse = "\n")
textWrapped <- factor(textWrapped, levels = levelsWrapped, ordered = TRUE)
}
textLinesCount <- sapply(textBroken, length)
textLinesMaxNchar <- sapply(sapply(textBroken, nchar), max) %>% head(length(textWrapped)) # head fix by OSCEDVS-33
textValuePrependLines <- sapply(textLinesCount, function(x){paste0(rep("\n", x), collapse = "")})
textLinesNcharDiff <- textLinesMaxNchar / wrapCharLength
textLinesCountDiff <- textLinesCount / lineCountMax
data.table(
textWrapped,
textLinesCount,
textLinesMaxNchar,
textValuePrependLines,
textLinesNcharDiff,
textLinesCountDiff
)
}
country_name <- function(vec){
missing <- data.table(Alpha_2 = c("XK"), Name = c("Kosovo"))
ISO_3166_1 <- rbind(ISO_3166_1[, c("Alpha_2", "Name")], missing, fill = TRUE)
vec <- toupper(vec)
vec <- ifelse(vec == "EL", "GR", vec)
vec <- ifelse(vec == "EN", "GB", vec)
vec <- ifelse(vec == "UK", "GB", vec)
ISO_3166_1[match(vec, ISO_3166_1$Alpha_2), ]$Name %>% as.factor
}
language_name <- function(vec){
iso_ext <- data.table(
Alpha_3_B = rep("", 2),
Alpha_3_T = rep("", 2),
Alpha_2 = c("sr-cyr", "sr-lat"),
Name = c("Serbian Cyrillic", "Serbian Latin")
) %>% rbind(ISO_639_2)
vec <- tolower(vec)
res <- iso_ext[match(vec, iso_ext$Alpha_2), ]$Name
gsub('[^[:alnum:]^[:space:]].*', '', res) %>% as.factor
}
demonyms <- fread(getSourcePath("input/misc/Demonyms-List.csv", repository))
setnames(demonyms, c("ISO_3166", "Country", "Demonym_1", "Demonym_2", "Demonym_3"))
nationality_name <- function(vec){
vec <- toupper(vec)
vec <- ifelse(vec == "EL", "GR", vec)
vec <- ifelse(vec == "EN", "GB", vec)
vec <- ifelse(vec == "UK", "GB", vec)
res <- demonyms[match(vec, ISO_3166)]$Demonym_1
res <- gsub('[^[:alnum:]].*', '', res)
ifelse(is.na(vec), NA, res) %>% as.factor
}
gender_name <- function(x){
gender_lookup <- data.table(
name = c("Female", "Male"),
value = c("F", "M")
)
gender_lookup[match(x, gender_lookup$value), name] %>% as.factor
}
year_groups <- function(year, firstYear = 1941, frame = 5){
year <- year %>% as.numeric
firstYear <- firstYear %>% as.numeric
lastYear <- max(year[!is.na(year)]) - ((max(year[!is.na(year)]) - firstYear) %% frame)
groupNum <- ((lastYear - firstYear) / frame) + 1
lowerBounds <- firstYear + frame*(seq(groupNum)-1)
upperBounds <- lowerBounds + frame - 1
yearLevels <- lapply(seq_len(groupNum), function(year)paste0(lowerBounds[year],"-",upperBounds[year])) %>% unlist
yearGroup <- rep(NA, length(year))
for(i in seq_along(yearLevels)){
yearGroup <- ifelse(year >= lowerBounds[i] & year <= upperBounds[i], yearLevels[i], yearGroup)
}
yearGroup %>% ordered(yearLevels)
}
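With the defaults (firstYear = 1941, frame = 5), an invented vector of years maps to five-year bands:
year_groups(c(1987, 1993, 2001))
# 1986-1990, 1991-1995, 2001-2005 as an ordered factor over all bands from 1941-1945 onwards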
employment_name <- function(x){
employment_lookup <- data.table(
name = c("Employed", "Unemployed"),
value = c(TRUE, FALSE)
)
employment_lookup[match(x, employment_lookup$value), name] %>% as.factor
}
broad_category_name <- function(x){
broad_category_lookup <- data.table(
name = c("Foreign Language", "Mother Tongue", "Computer", "Job Related", "Organisational", "Communication", "Certificate", "Driving", "Unknown"),
value = c("ForeignLanguage", "MotherTongue", "Computer", "JobRelated", "Organisational", "Communication", "Certificate", "Driving", NA)
)
broad_category_lookup[match(x, broad_category_lookup$value), name] %>% as.factor
}
studying_name <- function(x){
studying_lookup <- data.table(
name = c("Currently studying", "No ongoing studies"),
value = c(TRUE, FALSE)
)
studying_lookup[match(x, studying_lookup$value), name] %>% as.factor
}
eqf_level_name <- function(x, eqfLevelLevels = c(rep("Level 1-4", 4), "Level 5", "Level 6", "Level 7", "Level 8")){
eqf_level_lookup <- data.table(
name = eqfLevelLevels,
value = c("1", "2", "3", "4", "5", "6", "7", "8")
)
eqf_level_lookup[match(x, eqf_level_lookup$value), name] %>% ordered(unique(eqfLevelLevels))
}
age_groups <- function(age, ageLevels = c("15-24", "25-49", "50-64"), supremum = 65){
splitAge <- as.numeric(c(gsub("[^[:alnum:]].*", "", ageLevels), supremum))
if (splitAge[1] == 0) ageLevels[1] <- paste("Up to", splitAge[2] - 1)
ageGroup <- rep(NA, length(age))
for(i in seq_along(ageLevels)){
ageGroup <- ifelse(age >= splitAge[i] & age < splitAge[i+1], ageLevels[i], ageGroup)
}
ageGroup %>% ordered(ageLevels)
}
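For example, with the default bands (invented ages):
age_groups(c(19, 33, 52))
# "15-24" "25-49" "50-64" as an ordered factor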
work_experiences <- function(work_years, workYearLevels = c("1-5", "6-10", "11-25", "26-49"), supremum = 50){
splitWorkYears <- as.numeric(c(gsub("[^[:alnum:]].*", "", workYearLevels), supremum))
if (splitWorkYears[1] == 1) workYearLevels[1] <- paste("Up to", splitWorkYears[2] - 1)
if (splitWorkYears[1] != 0) {
splitWorkYears <- c(0, splitWorkYears)
workYearLevels <- c("No experience", paste(workYearLevels, "years"))
} else {
workYearLevels <- paste(workYearLevels, "years")
}
workExpGroup <- rep(NA, length(work_years))
for(i in seq_along(workYearLevels)){
workExpGroup <- ifelse(work_years >= splitWorkYears[i] & work_years < splitWorkYears[i+1], workYearLevels[i], workExpGroup)
}
workExpGroup %>% ordered(workYearLevels)
}
job_positions <- function(num_jobs, max_jobs = 13) {
num_jobs <- as.numeric(num_jobs)
num_jobs[which(num_jobs > max_jobs)] <- max_jobs
job_levels <- sort(num_jobs) %>% unique
job_levels <- ifelse(job_levels < max_jobs, as.character(job_levels), paste0(as.character(job_levels), "+"))
job_levels <- ifelse(job_levels == 1, paste(job_levels, "position"), paste(job_levels, "positions"))
num_jobs <- ifelse(num_jobs < max_jobs, as.character(num_jobs), paste0(as.character(num_jobs), "+"))
num_jobs <- ifelse(num_jobs == 1, paste(num_jobs, "position"), paste(num_jobs, "positions"))
num_jobs %>% ordered(job_levels)
}
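An invented call shows the capping at max_jobs:
job_positions(c(1, 3, 15))
# "1 position" < "3 positions" < "13+ positions" (values above 13 are reported as "13+")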
get_word_freqs <- function(dt, textCol = "text", groupCol = "group", stopWords = stop_words) {
setnames(dt, c(textCol, groupCol), c("text", "group"))
freqs <- dt %>%
unnest_tokens(word, text) %>%
anti_join(stopWords) %>%
count(group, word) %>%
complete(group, word, fill = list(n = 0)) %>%
group_by(group) %>%
mutate(total = sum(n), percent = n / total) %>%
ungroup()
setnames(dt, c("text", "group"), c(textCol, groupCol))
freqs
}
get_word_models <- function(dt, textCol = "text", groupCol = "group", freqs = NA, min_n = 50) {
setnames(dt, c(textCol, groupCol), c("text", "group"))
if(is.na(freqs))freqs <- get_word_freqs(dt)
models <- freqs %>%
group_by(word) %>%
filter(sum(n) > min_n) %>%
do(tidy(glm(cbind(n, total - n) ~ group, ., family = "binomial"))) %>%
ungroup()
setnames(dt, c("text", "group"), c(textCol, groupCol))
models
}
findNGrams <- function(corpus, min_n, max_n = min_n, stopWords = NA_character_) {
ngrams <- itoken(corpus, tokenizer = word_tokenizer, progressbar = FALSE) %>%
create_vocabulary(stopwords = stopWords, c(min_n, max_n), sep_ngram = " ") %>%
as.data.table()
ngrams[order(-doc_count)][, .(term, count = doc_count)]
}
getStopwords <- function(locale) {
stopwordsLocale <- c(stopwords_getlanguages(source = "misc"), stopwords_getlanguages(source = "snowball"))
stopWords <- ""
if (locale %in% stopwordsLocale)
stopWords <- locale %>% stopwords
stopWords
}
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1) {
tokensList <- strsplit(corpus[, text], " ")
names(tokensList) <- corpus[, code]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count)]
if(normalize == "log")tfDT[, tf := log(1 +term_count)]
merge(tfDT, idfDT, on = "term")[, tfIdf := tf*idf ][, .(term, class, tfIdf)]
}
igraph_from_arules <- function(x){
itemNodes <- which(itemFrequency(items(generatingItemsets(x)), type = "absolute") > 0)
assocNodes <- paste("assoc", 1:length(x), sep='')
lhs <- LIST(lhs(x), decode=FALSE)
from_lhs <- unlist(lhs)
to_lhs <- assocNodes[rep(1:length(x), sapply(lhs, length))]
rhs <- LIST(rhs(x), decode=FALSE)
to_rhs <- unlist(rhs)
from_rhs <- assocNodes[rep(1:length(x), sapply(rhs, length))]
type <- c(rep(1, length(itemNodes)), rep(2, length(assocNodes)))
nodeLabels <- c(itemLabels(x)[itemNodes], rep("", length(assocNodes)))
e.list <- cbind(c(from_lhs, from_rhs), c(to_lhs, to_rhs))
v.labels <- data.frame(
name = c(as.character(itemNodes), assocNodes),
label = nodeLabels,
stringsAsFactors = FALSE)
g <- igraph::graph.data.frame(e.list, directed=TRUE, vertices=v.labels)
## add quality measures
for(m in names(quality(x))) {
g <- igraph::set.vertex.attribute(g, m, which(type==2),
quality(x)[[m]])
}
return(g)
}
timeNeeded <- (proc.time()[3] - source.starting.time);
../000.scripts/data/data_wrangle.R completed in 0.03 seconds
Type of data: data.table, data.frame.
Dimensions: 1635291, 16.
Column Names: education_field, eqf_granted, institution, institution_country, enrollment_year, graduation_year, locale, country, birth_year, gender, nationality, mother_tongue, headline_job, headline_isco, eqf_highest, responses.
1.get.data.R completed in 1.45 seconds
educationDT[!is.na(graduation_year) & !is.na(enrollment_year), study_years := as.numeric(graduation_year) - as.numeric(enrollment_year)]
educationDT[is.na(graduation_year) & !is.na(enrollment_year), study_years := 2019 - as.numeric(enrollment_year)]
educationDT[study_years == 0, study_years := 1]
educationDT[study_years > 15 | study_years < 0, study_years := NA]
educationDT[, education_field := factor(education_field)]
educationDT[, eqf_granted := eqf_level_name(eqf_granted)]
educationDT[, institution_country := country_name(institution_country)]
educationDT[, enrollment_year := ordered(enrollment_year)]
educationDT[, graduation_year := ordered(graduation_year)]
educationDT[, country := country_name(country)]
educationDT[, age_group1 := age_groups(2019 - as.numeric(birth_year))]
educationDT[, age_group2 := age_groups(2019 - as.numeric(birth_year), c("0-20", "21-25", "26-30", "31-35", "36+"), supremum = Inf)]
educationDT[, gender := gender_name(gender)]
educationDT[, nationality := nationality_name(nationality)]
educationDT[, mother_tongue := factor(mother_tongue)]
educationDT[, locale := language_name(locale)]
aggregate_cols <- c(
"education_field", "eqf_granted", "institution_country", "study_years", "enrollment_year", "graduation_year", "locale", "country", "age_group1", "age_group2", "gender", "nationality", "mother_tongue"
)
fieldsIndicator <- educationDT[, .SD, .SDcols = c(aggregate_cols, "responses")]
fieldsIndicator <- fieldsIndicator[, .(responses = sum(responses)), by = aggregate_cols] %>% setorderv(aggregate_cols, na.last = TRUE)
2.process.data.R completed in 23.76 seconds
fst format
fileName <- "education_fields_indicator.fst"
saveBinary(fieldsIndicator, fileName, outputRepo, format = "fst")
Datasource : /data/generic/jobsOutput/code_book/indicators/education_fields_indicator.fst of 9,769,287 bytes.
csv format
fileName <- "/data/tmpfs/results/survey_data/csv/education_fields_indicator.zip"
fwrite_zip(fieldsIndicator, fileName, quote = TRUE)
Datasource : /data/tmpfs/results/survey_data/csv/education_fields_indicator.zip of 8,105,205 bytes.
3.save.data.R completed in 12.5 seconds
Completed in 38.33 seconds.