Abstract
This process loads, cleans and transforms previously scraped data to create a corpus for each EQF level across the different locales. The corpora of the most important locales will be supplemented using machine translation. The resulting data set is saved in binary format for further analysis.
# http://rmarkdown.rstudio.com/html_document_format.htm
sourceTimeNeeded <- c(0);
source.starting.time <- proc.time()[3]
Information about the libraries, environment, sources used and their execution is reported. Additional information is provided within section tabs. The report can also be navigated through the table of contents. Reported tables can be dynamically filtered, searched, ordered and exported into various formats.
R.Version()$version.string
## [1] "R version 3.4.4 (2018-03-15)"
librariesVersion <- c()
for(i in 1:length(libraries))
librariesVersion <- c(librariesVersion, paste(packageVersion(libraries[i] )))
librariesLoaded <- lapply(libraries, require, character.only = TRUE)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: DT
## Loading required package: stringr
## Loading required package: xml2
## Loading required package: httr
## Loading required package: magrittr
## Loading required package: text2vec
## text2vec is still in beta version - APIs can be changed.
## For tutorials and examples visit http://text2vec.org.
##
## For FAQ refer to
## 1. https://stackoverflow.com/questions/tagged/text2vec?sort=newest
## 2. https://github.com/dselivanov/text2vec/issues?utf8=%E2%9C%93&q=is%3Aissue%20label%3Aquestion
## If you have questions please post them at StackOverflow and mark with 'text2vec' tag.
## Loading required package: stopwords
## Loading required package: cld2
## Loading required package: cld3
##
## Attaching package: 'cld3'
## The following objects are masked from 'package:cld2':
##
## detect_language, detect_language_mixed
## Loading required package: parallel
timeNeeded <- (proc.time()[3] - source.starting.time);
../00.core/00.01.libraries.R completed in 2.01 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
## Base functions
# EPAS-DS
# @authors ds@eworx.gr
repository <- "/data/generic/"
getSourcePath <- function(filename, baseFolder = repository){
return(paste(baseFolder, filename, sep = ""))
}
readData <- function(filename, colClasses = c(), baseFolder = repository, header = TRUE, sep = "\t", encoding = "UTF-8", stringsAsFactors = TRUE, na.strings = c("", "NULL"), verbose = FALSE){
if(length(colClasses) == 0)
return (data.table::fread(input = getSourcePath(filename, baseFolder), header = header, sep = sep, encoding = encoding, stringsAsFactors = stringsAsFactors, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
return (data.table::fread(input = getSourcePath(filename, baseFolder), colClasses = colClasses, header = header, sep = sep, encoding = encoding, verbose = verbose, showProgress = TRUE, na.strings = na.strings ) )
}
#rds for small disk space & fst for fast load
saveBinary <- function(data, filename, baseFolder = repository, format = "rds"){
fileName <- getSourcePath(filename, baseFolder)
dir.create(dirname(fileName), recursive = TRUE, showWarnings = FALSE)
if(format == "rds") saveRDS(data, fileName)
if(format == "fst") fst::write_fst(data, fileName)
}
#alternative for rough read write operations
saveRDS_ <- function(object, file){
dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
saveRDS(object, file)
}
#rds for small disk space & fst for fast load
loadBinary <- function(filename, baseFolder = repository, format = "rds", as.data.table = TRUE){
if(format == "rds"){return(readRDS(getSourcePath(filename, baseFolder)))}
if(format == "fst"){return(fst::read_fst(getSourcePath(filename, baseFolder), as.data.table = as.data.table))}
}
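#Hypothetical usage sketch (not part of the pipeline): round-trip a small table through saveBinary/loadBinary.
#The sub-folder "examples/" and the object below are illustrative only.
exampleDT <- data.table::data.table(id = 1:3, term = c("alpha", "beta", "gamma"))
saveBinary(exampleDT, "examples/exampleDT.rds", baseFolder = repository)
restoredDT <- loadBinary("examples/exampleDT.rds", baseFolder = repository)
all.equal(exampleDT, restoredDT) #expected to be TRUE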
rowColumns <- function(data){
return(paste( format(nrow(data), big.mark=","), "Rows X ", ncol(data), "Columns"))
}
publishIncludeCss <- function(){
sourceFile <- "/data/jobs/wp41.analysis/000.core/include.css"
destinationFile <- "/data/tmpfs/results/include.css"
if (!file.exists(destinationFile)) {
return(file.copy(sourceFile, destinationFile))
}else{
return(TRUE)
}
}
#as the mount storage is in memory, make sure the asset include.css is there.
summariseTable <- function(data){
return(data.frame(unclass(summary(data)), check.names = FALSE, stringsAsFactors = FALSE))
#return(do.call(cbind, lapply(data, summary)))
}
factoriseCharacterColumns <- function(data){
for(name in names(data)){
if( class(data[[name]]) =="character"){
data[[name]] <- as.factor(data[[name]])
}
}
return(data)
}
############################
# https://rstudio.github.io/DT/010-style.html
#https://rpubs.com/marschmi/RMarkdown
capitalise <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2, nchar(x)))
styliseDTNumericalColumn <- function(data, result, columnName, color, columnsName_original ){
if(columnName%in% columnsName_original){
result <- result %>% formatStyle(
columnName,
background = styleColorBar(data[[columnName]], color),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
}
return(result)
}
reportTabularData <- function(data, anonymize=TRUE){
if(anonymize)return() #when anonymized, no table is rendered (the calls below then print NULL)
columnsName <- names(data)
columnsName <- lapply(columnsName, capitalise)
columnsName_original <- names(data)
result <-
DT::datatable(
data,
class = 'cell-border stripe',
filter = 'top',
rownames = FALSE,
colnames = columnsName,
extensions = 'Buttons',
options = list(
pageLength = 20,
columnDefs = list(list(className = 'dt-left', targets = "_all")),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf'),
searchHighlight = TRUE,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'border': '1px solid'});",
"}"
)
)
)
result <- styliseDTNumericalColumn(data,result, "Count", 'steelblue', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "sourceTimeNeeded", '#808080', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "timeNeeded", '#808080', columnsName_original)
#result <- styliseDTNumericalColumn(data,result, "percentMatch", '#5fba7d', columnsName_original)
result <- styliseDTNumericalColumn(data,result, "percentMatch", '#4682b4', columnsName_original)
return(result)
}
#data persistence info
reportFileInfo <- function(filename, baseFolder = repository) {
paste0(
getSourcePath(filename, baseFolder), " of size ",
utils:::format.object_size(file.size(getSourcePath(filename, baseFolder)) + 1000000, "auto"))
}
fonts <- list(
sans = "DejaVu Serif",
mono = "DejaVu Serif",
`Times New Roman` = "DejaVu Serif"
)
#read_xml_to_list <- function(filepath, is.gz = FALSE){
# if(is.gz){
# temp_data <- paste0(repository, "data/delete.me")
# result <- xmlToList(xmlParse(gunzip(filepath, destname = temp_data, remove =FALSE)))
# Sys.chmod(file.path(temp_data), "777", use_umask = FALSE)
# unlink(temp_data)
# result
# }else{
# xmlToList(xmlParse(filepath))
# }
#}
#transpose_list_to_dt <- function(data_list){
# dt <- t(as.data.table(data_list))
# dt <- as.data.table(dt)
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# dt[, (names(dt)) := lapply(.SD, unlist), .SDcols = 1:ncol(dt)]
# names(dt) <- names(data_list[[1]])
# dt
#}
cleansingCorpus <- function(
htmlString, rem.html =TRUE, rem.http = TRUE, rem.newline = TRUE,
rem.nonalphanum = TRUE, rem.longwords = TRUE, rem.space = TRUE,
tolower = TRUE, add.space.to.numbers = TRUE, rem.country.begin = FALSE,
rem.nonalphanum.begin = FALSE, rem.space.begin = FALSE, fix.greek = TRUE
){
if(rem.html){text <- gsub("<.*?>", " ", htmlString)} #removing HTML tags
if(rem.http){text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ", text)} #removing http destinations
if(rem.newline){text <- gsub("[\r\n\t]", " ", text)}
if(rem.nonalphanum){text <- gsub("[^[:alpha:]]", " ", text)} #removing non-alphabetic characters
if(rem.longwords){text <- gsub("\\w{35,}", " ", text)} #removing words with more than 35 characters
if(rem.space){text <- gsub("\\s+", " ", text)} #removing excess space
if(tolower){text <- tolower(text)}
if(add.space.to.numbers){ #add space between numbers and letters
text <- gsub("([0-9])([[:alpha:]])", "\\1 \\2", text)
text <- gsub("([[:alpha:]]|[.])([0-9])", "\\1 \\2", text)
}
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(rem.country.begin){text <- gsub("^EU", "", text)} #remove country codes from the beginning of the text
if(rem.nonalphanum.begin){text <- gsub("^[?–-]*", "", text)} #remove special characters identified in the beginning of text
if(rem.space.begin){text <- gsub("^[[:space:]]*", "", text)}
if(fix.greek){
text <- gsub("ς", "σ", text)
text <- gsub("ά", "α", text)
text <- gsub("έ", "ε", text)
text <- gsub("ή", "η", text)
text <- gsub("ί", "ι", text)
text <- gsub("ύ", "υ", text)
text <- gsub("ό", "ο", text)
text <- gsub("ώ", "ω", text)
}
trimws(text)
}
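#Hypothetical illustration of cleansingCorpus on a made-up HTML snippet (not part of the pipeline):
#tags, URLs, digits and punctuation are stripped, case is lowered and Greek accents are normalised.
cleansingCorpus("<p>Certificate LEVEL4 &ndash; see https://example.eu/info για μάθηση</p>")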
cleansingEducationCorpus <- function(text) {
text <- gsub("\\.", "", text) #removing periods
text <- gsub("[[:punct:]]", " ", text) #removing other punctuation
text <- gsub("\\s+", " ", text) #removing excess space
text <- tolower(text) #changing case to lower
#removing accent from Greek
text <- gsub("ς", "σ", text)
text <- gsub("ά", "α", text)
text <- gsub("έ", "ε", text)
text <- gsub("ή", "η", text)
text <- gsub("ί", "ι", text)
text <- gsub("ύ", "υ", text)
text <- gsub("ό", "ο", text)
text <- gsub("ώ", "ω", text)
trimws(text) #trimming white-space
}
#This function removes dates that are "relics" from the xml parsing
removeDates <- function(text){
days <- "(Sunday,|Monday,|Tuesday,|Wednesday,|Thursday,|Friday,|Saturday,)"
months <- "(January|February|March|April|May|June|July|August|September|October|November|December|Months)"
date_form1 <- paste(days, months, "([0-9]|[0-9][0-9]), [0-9][0-9][0-9][0-9]")
date_form2 <- "\\?[0-9][0-9][0-9][0-9]"
text <- gsub(date_form1, " ", text)
gsub(date_form2, " ", text)
}
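#Hypothetical example (not part of the pipeline): both date forms are replaced by spaces.
removeDates("Awarded on Monday, March 3, 2014 ?2016 by the institute")
#expected: the date fragments are removed, leaving extra spaces for later cleansing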
xmlToDataTable <- function(xmlData, itemNames){
itemList <- lapply(itemNames,
function(x){
xml_text(xml_find_all(xmlData, paste0(".//item/", x)))
}
)
names(itemList) <- itemNames
as.data.table(itemList)
}
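#Hypothetical usage sketch (not part of the pipeline): a tiny feed with two <item> nodes.
feedXML <- read_xml("<rss><item><title>A</title><level>3</level></item><item><title>B</title><level>5</level></item></rss>")
xmlToDataTable(feedXML, c("title", "level"))
#expected: a 2-row data.table with character columns title and level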
cleanCorpusHtml <- function(text){
unlist(lapply(text, function(x){
if(nchar(x) > 0){
# some descriptions start with tag keywords inside <li> elements; these are relocated to the end
# so that the information is kept and the description starts with the main content
html <- gsub(">","> ", x) # add spaces after html tags so these aren't concatenated
xml <- read_xml(html, as_html = TRUE)
lis <- xml_find_all(xml, ".//li")
xml_remove(lis)
text <- paste( paste(xml_text(xml), collapse ="") , paste(xml_text(lis) , collapse =""), collapse ="")
text <- gsub("\\s+"," ", text)
}else {""}
}))
}
#Split a vector into n_chunks chunks of roughly equal size
equal_split <- function(vct, n_chunks) {
lim <- length(vct)
fstep <- lim%/%n_chunks
idx_list <- list()
for(i in seq(n_chunks - 1)){
idx_list[[i]] <- vct[((i-1)*fstep + 1):(i*(fstep))]
}
idx_list[[n_chunks]] <- vct[((n_chunks - 1)*fstep + 1):(lim)]
return(idx_list)
}
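#Hypothetical example (not part of the pipeline): ten indices split into three chunks,
#with the last chunk absorbing the remainder.
equal_split(1:10, 3)
#expected: list(1:3, 4:6, 7:10)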
#Takes a vector of values and returns the ids of the (at most numHead) smallest values above the threshold
getThresholdOrderRwmd <- function(vct, idVec, threshold = 1e-6, numHead = 10){
vct <- ifelse(vct > threshold, vct, Inf)
indexVec <- head(order(vct), numHead)
idVec[indexVec]
}
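#Hypothetical example (not part of the pipeline): values at or below the threshold are treated as Inf,
#so only genuine matches are ranked.
getThresholdOrderRwmd(c(0.4, 0, 0.1, 0.3), c("a", "b", "c", "d"), numHead = 2)
#expected: c("c", "d")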
#Extracts a given attribute from XML elements of a given type within description nodes
maintainElements <- function(nodes, elementType = "a", attribute = "href"){
xml_attr(xml_find_all(nodes, paste0(".//", elementType)), attribute)
}
#Function to add results to datatable
elementsToDataTable <- function(result, elementType){
if(length(result) > 0)
data.table(elementType = elementType, attributeValue = result)
else
data.table()
}
#Function to retrieve urls from text
keepHtmlElements <- function(feedItem){
nodes <- read_xml(paste0("<div>", feedItem, "</div>"), as_html = TRUE)
rbind(
elementsToDataTable(maintainElements(nodes, "a", "href"), "link"),
elementsToDataTable(maintainElements(nodes, "img", "src"), "image"),
elementsToDataTable(maintainElements(nodes, "img-src", "src"), "image")
#All "img-src" are NA
)
}
#retrieve the list of parameters in an HTTP request query
getQueryParams <- function(url){
query <- httr::parse_url(url)$query
queryValues <- unlist(query)
queryNames <- names(query)
dat <- data.table(varName = queryNames, value = queryValues)
dat[queryValues != ""]
}
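#Hypothetical example (not part of the pipeline): parameters with empty values are dropped.
getQueryParams("https://example.eu/search?term=nurse&page=2&filter=")
#expected: a data.table with varName = c("term", "page") and value = c("nurse", "2")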
keepCountryName <- function(string){
string <- gsub(".*_", "", string)
gsub("\\..*", "", string)
}
keepNTokens <- function(string, num){
tokenList <- strsplit(string, split = " ")
sapply(tokenList, function(tokens){
tokens <- sort(tokens)
tokensShift <- shift(tokens, -num, fill = FALSE)
paste(tokens[tokens != tokensShift], collapse = " ")
})
}
#Language detection
detectLanguage <- function(text, precision = 3) {
if (precision < 1) return(cld3::detect_language(text))
else if (precision == 2) return(cld2::detect_language(text))
pred <- cld2::detect_language(text)
ifelse(pred == cld3::detect_language(text), pred, NA_character_)
}
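#Hypothetical example (not part of the pipeline): at the default precision both detectors must agree.
detectLanguage("Questo titolo di studio è rilasciato in Italia")
#expected: "it" if cld2 and cld3 agree, NA otherwise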
###################################################################################################
# Functions related to ESCO qualifications scraping
###################################################################################################
# Returns a vector of the URIs in a particular results page for a given search query
getPageQualificationURIs <- function(locale = "en", eqfLevels = 1:8, pageNum = 1) {
resultsPageURL <- paste0(
"https://ec.europa.eu/esco/portal/qualificationSearch?",
"conceptLanguage=", locale,
"&searchTerm=",
"&eqfFilters=", paste(eqfLevels, collapse = ","),
"&page=", pageNum
) %>% url() # for Windows
resultsHTML <- read_html(resultsPageURL) %>%
xml_find_all(".//div[@class='content']") %>%
xml_children() %>%
xml_children() %>%
xml_attrs()
qualificationURIs <- grep(pattern = "http", resultsHTML, value = TRUE)
uriHead <- regexpr(pattern = "http", qualificationURIs)
uriTail <- regexpr(pattern = "');", qualificationURIs) - 1
substr(qualificationURIs, uriHead, uriTail)
}
# Returns total number of pages for a given search query
getNumOfPages <- function(locale = "en", eqfLevels = 1:8) {
resultsPageURL <- paste0(
"https://ec.europa.eu/esco/portal/qualificationSearch?",
"conceptLanguage=", locale,
"&searchTerm=",
"&eqfFilters=", paste(eqfLevels, collapse = ","),
"&page=", 1
) %>% url() #for Windows
numOfQuals <- read_html(resultsPageURL) %>%
xml_find_all(".//h1") %>% xml_text() %>% as.numeric()
qualsPerPage <- length(getPageQualificationURIs())
round(0.5 + numOfQuals / qualsPerPage)
}
# Returns HTML of a qualification based on its URI for a given locale
getQualificationHTML <- function(uri, locale = "en") {
paste0(
"https://ec.europa.eu/esco/portal/qualificationDetails?",
"conceptLanguage=", locale,
"&uri=", uri
) %>% url() %>% #for Windows
read_html()
}
# Parses an `xml_nodeset` object to extract useful data
parseXMLNodeSets <- function(xmlNodeSets) {
lapply(xmlNodeSets, function(x) {
labelText <- x %>% xml_find_all(".//p[@class='label']") %>% xml_text
allXml <- x %>% xml_find_all(".//p | .//ul")
varsIndex <- grep("class=\"label\"", allXml)
textData <- allXml %>% xml_text
limits <- c(varsIndex, length(textData) + 1)
data <- lapply(seq_along(varsIndex) , function(i) {
toPaste <- head(limits[i]:limits[i+1], -1)[-1]
paste(textData[toPaste], collapse = " ")
}) %>% as.data.table
setnames(data, textData[varsIndex])
titleText <- x %>% xml_find_all(".//h1") %>% xml_text
data[, "Title"] <- titleText[1]
data
}) %>% rbindlist(fill = TRUE)
}
# Extracts values from text structured in a "Label: Value" format
valuesFromLabeledText <- function(labeledText, label, otherLabels = c()){
cleanTokens <- labeledText %>%
paste("ENDLABEL:") %>%
cleansingCorpus(tolower = FALSE, rem.longwords = FALSE, rem.http = FALSE, rem.nonalphanum = FALSE) %>%
space_tokenizer()
label <- gsub("$", ":", label)
otherLabels <- gsub("$", ":", otherLabels)
otherLabels <- c("ENDLABEL:", otherLabels[label != otherLabels], label)
lapply(cleanTokens, function(tokens){
textStart <- which(tokens %in% label) + 1
textOther <- which(tokens %in% otherLabels) - 1
lapply(textStart, function(start){
fin <- textOther[which(textOther >= start) %>% min]
tokens[start:fin] %>% paste(collapse = " ")
}) %>% unlist() %>% paste(collapse = ", ")
}) %>% unlist() %>% paste0(",")
}
###################################################################################################
# Functions related to TF-IDF calculation
###################################################################################################
# The augmented frequency is used to prevent bias towards longer documents. This choice is
# justified by the fact that corpus size follows a roughly Gaussian distribution with respect to
# EQF level.
# The smooth inverse document frequency is used to prevent the IDF from nullifying the TF-IDF in cases
# where the TF can provide useful insight on its own. That resolves the edge case where a word
# appears in low frequency in every corpus, but in a significantly high frequency in one corpus.
findTFIDF <- function(corpus, stopwords, normalize = "double", min_char = 1, by.class = "class", threshold = -1) {
tokensList <- strsplit(corpus[, value], " ")
names(tokensList) <- corpus[, get(by.class)]
tokensDT <- lapply(tokensList, as.data.table) %>%
rbindlist(idcol = TRUE) %>%
setnames(c("class", "term"))
tokensDT <- tokensDT[!term %in% stopwords][nchar(term) > min_char]
#inverse document frequency smooth
idfDT <- tokensDT[!duplicated(tokensDT)][, .(docFreq = .N), by = "term"]
idfDT[, idf := log(length(unique(tokensDT$class)) / (docFreq + 1)) + 1]
tfDT <- tokensDT[, .(term_count = .N), by = c("class", "term")]
if(threshold > 0)tfDT[term_count > threshold, term_count := threshold]
if(normalize == "double")tfDT[, tf := 0.5 + 0.5 * term_count / max(term_count)]
if(normalize == "log")tfDT[, tf := log(1 + term_count)]
merge(tfDT, idfDT, by = "term")[, tfIdf := tf * idf][, .(term, class, tfIdf, docFreq)]
}
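#Hypothetical toy example (not part of the pipeline): one short document per class.
toyCorpus <- data.table(class = c("level1", "level2"),
value = c("basic reading skills", "advanced research skills"))
findTFIDF(toyCorpus, stopwords = c("and", "the"))
#expected: one row per (class, term) pair, with higher tfIdf for terms unique to a class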
#silence warnings pipe
`%W>%` <- function(lhs,rhs){
w <- options()$warn
on.exit(options(warn=w))
options(warn=-1)
eval.parent(substitute(lhs %>% rhs))
}
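#Hypothetical example (not part of the pipeline): the coercion warning is silenced inside the pipe.
as.numeric("not a number") %W>% is.na()
#expected: TRUE, without the usual "NAs introduced by coercion" warning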
getStopwords <- function(locale) {
stopwordsLocale <- c(stopwords_getlanguages(source = "misc"), stopwords_getlanguages(source = "snowball"))
stopWords <- ""
if (locale %in% stopwordsLocale)
stopWords <- locale %W>% stopwords
stopWords
}
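#Hypothetical example (not part of the pipeline): locales without a stopword list fall back to "".
getStopwords("en") #expected: a character vector of English stopwords
getStopwords("xx") #expected: ""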
###################################################################################################
# Text translation
###################################################################################################
translateText <- function(sourceText, sourceLang, translationLang, batchSize = 4800) {
if (length(sourceText) == 0) {
return ("")
} else if (length(sourceText) == 1) {
return (requestTranslation(sourceText, sourceLang, translationLang))
} else if (length(sourceText) > 4800) {
return (NA)
}
sourceQueries <- gsub("$", "\n >", sourceText)
sourceQueries <- gsub("^", "< \n", sourceQueries)
queries <- data.table(query = sourceQueries, size = nchar(sourceQueries), batch = nchar(sourceQueries))
for (row in seq(nrow(queries) - 1)) {
cumulativeSum <- queries[row, batch] + queries[row + 1, batch] + 3
queries[row + 1, batch := ifelse(cumulativeSum > batchSize, batch, cumulativeSum)]
}
batchStarts <- which(queries[, size] == queries[, batch])
batchFins <- c(batchStarts[-1] - 1, nrow(queries))
batches <- lapply(seq_along(batchStarts), function(i) batchStarts[i]:batchFins[i])
pastedQueries <- lapply(batches, function(batch) paste0(queries[batch, query], collapse = "\n")) %>% unlist()
translatedText <- lapply(pastedQueries, requestTranslation, sourceLang, translationLang) %>%
unlist() %>%
paste0(collapse = " ")
translatedText <- gsub("\\s?<", "", translatedText)
gsub(">$", "", translatedText) %>%
space_tokenizer(sep = ">") %>%
unlist() %>%
trimws()
}
requestTranslation <- function(sourceText, sourceLang, translationLang) {
googleTranslateURL <- paste0(
"https://translate.google.com/m",
"?hl=", sourceLang,
"&sl=", sourceLang,
"&tl=", translationLang,
"&ie=UTF-8&prev=_m&q=", URLencode(sourceText, reserved = TRUE)
)
GET(googleTranslateURL, add_headers("user-agent" = "Mozilla/5.0")) %>%
read_html() %>%
xml_child(2) %>%
xml_child(5) %>%
xml_text() %>%
unlist()
}
# NOTE: this second definition overrides the real requestTranslation above with a test stub,
# so no live requests are sent to Google Translate in this run (hence the "<test>" output below).
requestTranslation <- function(sourceText, sourceLang, translationLang) {
print("<test>")
}
###################################################################################################
# Text mining
###################################################################################################
findNGrams <- function(corpus, min_n, max_n = min_n, stopWords = NA_character_) {
ngrams <- itoken(corpus, tokenizer = word_tokenizer, progressbar = FALSE) %>%
create_vocabulary(stopwords = stopWords, c(min_n, max_n), sep_ngram = " ") %>%
as.data.table()
ngrams[order(-doc_count)][, .(term, count = doc_count)]
}
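#Hypothetical example (not part of the pipeline): unigrams and bigrams from two short titles.
findNGrams(c("master of science", "master of arts"), min_n = 1, max_n = 2, stopWords = "of")
#expected: "master" at the top with count 2, followed by rarer n-grams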
###########################################################################################################
(data.table(library = libraries, version = librariesVersion))
includeCssPublished <- publishIncludeCss()
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] cld3_1.3 cld2_1.2 stopwords_1.0 text2vec_0.5.1
## [5] magrittr_1.5 httr_1.4.1 xml2_1.2.2 stringr_1.4.0
## [9] DT_0.10 dplyr_0.8.3 rmarkdown_2.1 data.table_1.12.6
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3 formatR_1.7 futile.logger_1.4.3
## [4] pillar_1.4.3 compiler_3.4.4 highr_0.8
## [7] futile.options_1.0.1 iterators_1.0.12 tools_3.4.4
## [10] digest_0.6.22 jsonlite_1.6 evaluate_0.14
## [13] lifecycle_0.2.0 tibble_3.0.1 lattice_0.20-38
## [16] pkgconfig_2.0.3 rlang_0.4.5 Matrix_1.2-14
## [19] foreach_1.4.7 mlapi_0.1.0 yaml_2.2.0
## [22] xfun_0.11 knitr_1.26 vctrs_0.2.4
## [25] htmlwidgets_1.5.1 grid_3.4.4 tidyselect_0.2.5
## [28] glue_1.3.1 R6_2.4.1 lambda.r_1.2.4
## [31] purrr_0.3.3 codetools_0.2-15 ellipsis_0.3.0
## [34] htmltools_0.4.0 assertthat_0.2.1 stringi_1.4.3
## [37] RcppParallel_4.4.4 crayon_1.3.4
timeNeeded <- (proc.time()[3] - source.starting.time);
../00.core/00.02.base.functions.R completed in 0.16 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
Data on the ESCO qualifications has been scraped in English, as well as in the respective language of each qualification’s country. These two sets will be bound together and processed to create a corpus.
inputRepo <- getSourcePath("jobsOutput/collected_data/education/")
eqfEng <- loadBinary("escoQualifications.rds", inputRepo)
Type of data: data.table, data.frame.
Dimensions: 9606, 23.
Column Names: Field (ISCED FoET 2013), Country/Region, EQF level, Description of the qualification, Awarding body or competent authority, Data provider , URI, Information language, Title, Credit points, Further information on the qualification, Link to relevant supplements, Homepage of the qualification , Ways to acquire qualification, Notional workload needed to achieve the learning outcomes, Definition, Landing page of the qualification , Entry requirements, Relationship to occupations or occupational fields, Owner of the qualification, External quality assurance/regulatory body, Creator of the qualification , Alternative Title.
eqfLoc <- loadBinary("escoQualificationsLoc.rds", inputRepo)
Type of data: data.table, data.frame.
Dimensions: 9606, 24.
Column Names: Field (ISCED FoET 2013), Country/Region, EQF level, Description of the qualification, Awarding body or competent authority, Data provider , URI, Information language, Title, Homepage of the qualification , Further information on the qualification, Credit points, Link to relevant supplements, Ways to acquire qualification, Notional workload needed to achieve the learning outcomes, External quality assurance/regulatory body, Relationship to occupations or occupational fields, Owner of the qualification, Creator of the qualification , Alternative Title, Definition, Landing page of the qualification , Entry requirements, Locale.
timeNeeded <- (proc.time()[3] - source.starting.time);
1.get.data.R completed in 1.8 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
Qualification data, which attributes the standardized EQF levels to specific types of education, is currently in raw tabular form. To make it useful, the relevant columns need to be determined, cleaned of structural and HTML artifacts, and used to create a corpus.
eqfRelevant <- rbind(eqfEng, eqfLoc, fill = TRUE)[, c("EQF level", "Locale", "Title", "Alternative Title", "URI"), with = FALSE]
setnames(eqfRelevant, c("level", "locale", "title", "alt", "uri"))
eqfRelevant[, level := as.numeric(level)]
eqfRelevant[["uri"]] <- gsub("http://data.europa.eu/esco/resource/", "", eqfRelevant[["uri"]]) %>% trimws()
eqfCorpus <- melt(eqfRelevant, id.vars = c("level", "locale", "uri"), measure.vars = c("title", "alt"))
eqfCorpus <- eqfCorpus[!is.na(value)]
Certain qualifications indicate they are in a language other than their actual one. An attempt will be made to detect each entry’s language so that the corpus is as consistent as possible.
eqfCorpus[is.na(locale), localePred := detectLanguage(value)]
eqfCorpus[localePred == "en", locale := "en"]
eqfCorpus <- eqfCorpus[!is.na(locale)][, localePred := NULL]
importantLocales <- c("it", "pt", "ro", "es", "hu", "el")
The most common languages in Europass CVs have been determined to be: en, it, pt, ro, es, hu, el.
An augmented corpus will be formed using machine translations of the most complete locale available, which has been determined to be English. A translation approach using Google Translate has been chosen: translations produced by the web version of Google Translate will be scraped.
translatedCorpus <- lapply(importantLocales, function(loc) {
lapply(1:8, function(lev) {
transCorpus <- eqfCorpus[locale == "en" & level == lev & variable == "title", value] %>% trimws()
data.table(level = lev, locale = loc, variable = "translation", value = translateText(transCorpus, "en", loc))
}) %>% rbindlist()
}) %>% rbindlist()
## [1] "<test>"
## [1] "<test>"
[The stubbed requestTranslation prints "<test>" once per translation request; the remaining repetitions of this output are omitted.]
translatedCorpus[, uri := 1:.N, by = c("locale", "level")]
translatedCorpus[, uri := paste("t", locale, level, uri, sep = "_")]
eqfCorpus <- rbind(eqfCorpus, translatedCorpus[!is.na(value)])
timeNeeded <- (proc.time()[3] - source.starting.time);
2.process.data.R completed in 30.66 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.starting.time <- proc.time()[3]
outputRepo <- getSourcePath("jobsOutput/collected_data/education/")
filename <- "eqfCorpus.rds"
saveBinary(eqfCorpus, filename, outputRepo)
Datasource : /data/generic/jobsOutput/collected_data/education/eqfCorpus.rds of size 1.4 Mb.
reportTabularData(head(eqfCorpus, 30))
## NULL
timeNeeded <- (proc.time()[3] - source.starting.time);
3.save.data.R completed in 0.15 seconds
sourceTimeNeeded <- c( sourceTimeNeeded, timeNeeded)
source.blocks$sourceTimeNeeded <- sourceTimeNeeded;
Completed in 34.78 seconds.
reportTabularData(source.blocks);
## NULL