Introduction
This is a breakdown of the topic Forum:Exploratory Data Analysis.
This entry has the exploratory data analysis scripts coded in R (programming language).
Note: The regular expression looks for "{ {sidebar" up to "\n}}". If the pattern ended with only "}}", then the match would either run to the very end of the text, or be cut short by some links inside the sidebar itself. The consequence is that sidebar templates that do NOT use the indicated formatting and do NOT end with a closing line starting with "}}" will be missed.
Data
Data source as of 2015-11-16, downloaded with dumpgenerator.py from https://github.com/WikiTeam/wikiteam
10-check-tag-variables.R
This script loads the .xml file and goes over all the pages looking for a "sidebar" string. If one is found, it checks all the tags for a particular type of sidebar, and counts the occurrences of each type as well. There are some statements that filter the scope to the sidebar types that are my personal focus.
# 10-check-tag-variables.R
# 2015.11.16
#
# Walks every page of a MediaWiki XML dump, looks for a "{{sidebar ...}}"
# template in main-namespace pages, counts the occurrences of each sidebar
# type and, for the sidebar types of interest, accumulates the tag names used.
# Only the first sidebar found on a page is examined.
#
# Results left in the session:
#   sidebar_tags  - named list: sidebar type -> unique tag names seen
#   sidebar_count - named list: sidebar type -> number of pages counted
# A wiki-markup table with the counts is printed at the end.

# library() fails loudly when a package is missing (require() only warns)
library(XML)
library(stringi)

# dump file from 2015.11.16
stxml <- xmlParse("./memory_alphawikiacom-20151116-current.xml")
xmltop <- xmlRoot(stxml) # content of root

sidebar_tags <- list()
sidebar_count <- list()

# only these sidebar types get their tags collected (every type is counted)
tag_types <- c("individual", "planet", "species", "starship")

dotall <- stri_opts_regex(dotall = TRUE)

n <- xmlSize(xmltop) # number of nodes

# for all pages (all nodes skipping the first one); seq_len() keeps the loop
# empty instead of counting backwards when there are fewer than two nodes
for (i in seq_len(n)[-1]) {
  page <- xmltop[[i]]
  # get the namespace and the text in the revision in the page
  ns <- xmlValue(page[["ns"]])
  text <- xmlValue(page[["revision"]][["text"]])

  # main namespace only; isTRUE() also rejects a missing or NA namespace
  # instead of making if () fail
  if (isTRUE(ns == "0") && length(text) == 1L && !is.na(text)) {
    # filter a sidebar: from "{{sidebar" up to a line starting with "}}"
    sidebar <- stri_match_all_regex(
      text,
      pattern = "\\{\\{[Ss]idebar\\s*(.*?)\\n\\}\\}",
      opts_regex = dotall
    )[[1]][, 2]

    # if found a sidebar
    if (!is.na(sidebar[1])) {
      # remove the \n
      sidebar <- stri_replace_all_regex(
        sidebar, "\\n", "",
        vectorize_all = FALSE
      )
      # remove links and templates marks
      sidebar <- stri_replace_all_regex(
        sidebar, "[\\[\\]]", "",
        vectorize_all = FALSE
      )
      sidebar <- stri_replace_all_regex(
        sidebar, "\\{\\{(.*?)\\|(.*?)\\}\\}", "$2",
        vectorize_all = FALSE
      )
      sidebar <- stri_replace_all_regex(
        sidebar, "\\{\\{(.*?)\\}\\}", "$1",
        vectorize_all = FALSE
      )
      # turn <br> line breaks into ";" and strip any remaining html tags
      sidebar <- stri_replace_all_regex(
        sidebar, "<br\\s*/?>", ";",
        opts_regex = stri_opts_regex(case_insensitive = TRUE),
        vectorize_all = FALSE
      )
      sidebar <- stri_replace_all_regex(
        sidebar, "<.*?>", "",
        vectorize_all = FALSE
      )

      # split the type of sidebar and the tags (first sidebar of the page)
      tmp <- stri_match_all_regex(
        sidebar,
        pattern = "([\\w/]*)\\|(.*)",
        opts_regex = dotall
      )[[1]]
      # collect the sidebar type
      sidebar_type <- tolower(tmp[1, 2])

      # a sidebar without a usable type cannot be keyed in the lists
      if (!is.na(sidebar_type) && nzchar(sidebar_type)) {
        if (sidebar_type %in% tag_types) {
          # split the remaining text into fields; this is a fixed-string
          # split, so no regex options apply
          fields <- stri_split_fixed(tmp[1, 3], pattern = "|")[[1]]
          # remove empty fields
          fields <- fields[fields != ""]
          # remove leading and trailing spaces
          fields <- stri_trim(fields)

          # rbind() of an empty list is NULL, so skip sidebars with no fields
          if (length(fields) > 0) {
            # split the tags and their values
            pairs <- do.call(
              rbind,
              stri_match_all_regex(fields, pattern = "(.*?)\\s*=\\s*(.*)")
            )
            # collect the tags; fields without "=" yield NA and are dropped
            tags <- pairs[, 2]
            tags <- tags[!is.na(tags)]
            # accumulate the tags for a type of sidebar
            sidebar_tags[[sidebar_type]] <-
              unique(c(sidebar_tags[[sidebar_type]], tags))
          }
        }

        # count the occurrence of this type of sidebar
        seen <- sidebar_count[[sidebar_type]]
        sidebar_count[[sidebar_type]] <- if (is.null(seen)) 1 else seen + 1
      }
    }
  }

  # progress report
  if (i %% 5000 == 0) {
    print(sprintf("%d / %d", i, n))
  }
}
print(sprintf("%d / %d", n, n))

print(lapply(sidebar_tags, sort))

# wiki table markup needs one element per line; writeLines() terminates every
# element with a newline
writeLines(c(
  "{| class=\"grey\"",
  "|+ Statistics for the sidebar counts",
  "|-",
  "! Variable",
  "! Count"
))
for (type in sort(names(sidebar_count))) {
  writeLines(c(
    "|-",
    paste0("| ", type),
    paste0("| ", sidebar_count[[type]])
  ))
}
writeLines("|}")
Output
$planet [1] "Affiliation" "Class" "datestatus" "imageOrbital" [5] "imageSurface" "is-format" "Location" "Name" [9] "Native Species" "OrbitalCap" "Rotational Period" "Satellites" [13] "Status" "SurfaceCap" "System" "Type" $individual [1] "actor" "affiliation" "blood type" "born" "captains_woman" [6] "children" "datestatus" "died" "father" "gender" [11] "height" "image" "image2" "image3" "imagecap" [16] "imagecap2" "imagecap3" "marital_status" "mother" "occupation" [21] "owner" "rank" "relative" "serial number" "sibling" [26] "species" "spouse" "status" "weight" $starship [1] "Affiliation" "Class" "Datestatus" "dt" "image" "image2" [7] "imagecap" "imagecap2" "Launched" "Logo" "Name" "operator" [13] "owner" "Registry" "Status" $species [1] "date" "image" "image2" "image3" "imagealt" "imagealt2" "imagecap" [8] "imagecap2" "imagecap3" "logo" "logoalt" "name" "planet" "planet2" [15] "pop" "quadrant" "quadrant2" "type"
Variable | Count |
---|---|
actor | 722 |
audio | 18 |
book | 370 |
calendar | 121 |
cards | 88 |
cd | 9 |
class | 262 |
comic | 869 |
conflict | 51 |
crew | 313 |
element | 129 |
episode | 716 |
fictional | 16 |
film | 13 |
government | 23 |
hologram | 57 |
individual | 1608 |
magazine | 413 |
novel | 830 |
planet | 382 |
rank | 22 |
series | 203 |
soundtrack | 56 |
species | 152 |
starship | 684 |
station | 43 |
strip | 68 |
video | 355 |
videogame | 90 |
xindi | 1 |
year | 137 |
20-extract-sidebars.R
# 20-extract-sidebars.R
# 2015.11.16
#
# Walks every page of a MediaWiki XML dump and, for main-namespace pages whose
# first sidebar is of type individual, planet, species or starship, records
# the page title and the value of every known tag of that sidebar type.
# One CSV file per sidebar type is written (individual.csv, planet.csv,
# species.csv, starship.csv) and the data frames individual, planet, species
# and starship are left in the session.
#
# Requires `sidebar_tags` (sidebar type -> tag names), which is built by
# 10-check-tag-variables.R; run that script first in the same session.

# library() fails loudly when a package is missing (require() only warns)
library(XML)
library(stringi)

if (!exists("sidebar_tags")) {
  stop(
    "sidebar_tags not found: run 10-check-tag-variables.R first",
    call. = FALSE
  )
}

# dump file from 2015.11.16
stxml <- xmlParse("./memory_alphawikiacom-20151116-current.xml")
xmltop <- xmlRoot(stxml) # content of root

# sidebar types that are extracted
sidebar_types <- c("individual", "planet", "species", "starship")

# per type: a list of columns ("Title" plus one column per tag), each column
# growing by exactly one entry for every matching page
records <- list()
for (type in sidebar_types) {
  records[[type]] <- list()
}

dotall <- stri_opts_regex(dotall = TRUE)

n <- xmlSize(xmltop) # number of nodes

# for all pages (all nodes skipping the first one); seq_len() keeps the loop
# empty instead of counting backwards when there are fewer than two nodes
for (i in seq_len(n)[-1]) {
  page <- xmltop[[i]]
  # get the title, the text in the revision in the page and the namespace
  title <- xmlValue(page[["title"]])
  text <- xmlValue(page[["revision"]][["text"]])
  ns <- xmlValue(page[["ns"]])

  # main namespace only; isTRUE() also rejects a missing or NA namespace
  # instead of making if () fail
  if (isTRUE(ns == "0") && length(text) == 1L && !is.na(text)) {
    # filter a sidebar: from "{{sidebar" up to a line starting with "}}"
    sidebar <- stri_match_all_regex(
      text,
      pattern = "\\{\\{[Ss]idebar\\s*(.*?)\\n\\}\\}",
      opts_regex = dotall
    )[[1]][, 2]

    # if found a sidebar
    if (!is.na(sidebar[1])) {
      # remove the \n
      sidebar <- stri_replace_all_regex(
        sidebar, "\\n", "",
        vectorize_all = FALSE
      )
      # remove links and templates marks
      sidebar <- stri_replace_all_regex(
        sidebar, "[\\[\\]]", "",
        vectorize_all = FALSE
      )
      sidebar <- stri_replace_all_regex(
        sidebar, "\\{\\{(.*?)\\|(.*?)\\}\\}", "$2",
        vectorize_all = FALSE
      )
      sidebar <- stri_replace_all_regex(
        sidebar, "\\{\\{(.*?)\\}\\}", "$1",
        vectorize_all = FALSE
      )
      # turn <br> line breaks into ";" and strip any remaining html tags
      sidebar <- stri_replace_all_regex(
        sidebar, "<br\\s*/?>", ";",
        opts_regex = stri_opts_regex(case_insensitive = TRUE),
        vectorize_all = FALSE
      )
      sidebar <- stri_replace_all_regex(
        sidebar, "<.*?>", "",
        vectorize_all = FALSE
      )

      # split the type of sidebar and the tags (first sidebar of the page);
      # same type pattern as 10-check-tag-variables.R so that both scripts
      # classify a page identically
      tmp <- stri_match_all_regex(
        sidebar,
        pattern = "([\\w/]*)\\|(.*)",
        opts_regex = dotall
      )[[1]]
      # collect the sidebar type
      sidebar_type <- tolower(tmp[1, 2])

      if (sidebar_type %in% sidebar_types) {
        # split the remaining text into fields; this is a fixed-string
        # split, so no regex options apply
        fields <- stri_split_fixed(tmp[1, 3], pattern = "|")[[1]]
        # remove empty fields
        fields <- fields[fields != ""]
        # remove leading and trailing spaces
        fields <- stri_trim(fields)

        if (length(fields) > 0) {
          # split the tags and their values
          pairs <- do.call(
            rbind,
            stri_match_all_regex(fields, pattern = "(.*?)\\s*=\\s*(.*)")
          )
          # collect the tags
          tags <- pairs[, 2]
          # collect the tags' values, without double quotes and '' marks
          values <- stri_replace_all_regex(pairs[, 3], "(\"|'')", "")
        } else {
          # rbind() of an empty list is NULL; a sidebar with no fields still
          # gets a row, with every tag set to NA
          tags <- character(0)
          values <- character(0)
        }

        # append one entry to every column of this sidebar type
        columns <- records[[sidebar_type]]
        columns[["Title"]] <- c(columns[["Title"]], title)
        for (tag in sort(sidebar_tags[[sidebar_type]])) {
          # which() ignores NA tags; only the first value of a repeated tag
          # is kept, NA when the page does not use the tag
          found <- values[which(tags == tag)]
          columns[[tag]] <- c(
            columns[[tag]],
            if (length(found) == 0) NA else found[1]
          )
        }
        records[[sidebar_type]] <- columns
      }
    }
  }

  # progress report
  if (i %% 5000 == 0) {
    print(sprintf("%d / %d", i, n))
  }
}
print(sprintf("%d / %d", n, n))

# page titles matching these patterns are dropped before writing each table
excluded_titles <- c(
  individual = "(Template:|Talk:|User:)",
  planet = "(Template:|Talk:)",
  species = "(Template:|Talk:)",
  starship = "(Template:|Talk:)"
)

tables <- list()
for (type in sidebar_types) {
  sheet <- data.frame(records[[type]], stringsAsFactors = FALSE)
  # drop = FALSE keeps a data frame even when Title is the only column
  keep <- !grepl(excluded_titles[[type]], sheet$Title)
  sheet <- sheet[keep, , drop = FALSE]
  write.csv(
    file = paste0(type, ".csv"),
    x = sheet,
    quote = TRUE,
    row.names = FALSE
  )
  tables[[type]] <- sheet
}

# keep the per-type data frames available under their usual names
individual <- tables[["individual"]]
planet <- tables[["planet"]]
species <- tables[["species"]]
starship <- tables[["starship"]]
Output
None
Discussion
Updated with data from a database dump taken on the 2015.09.30, -- DataScientist (talk) 03:13, October 2, 2015 (UTC)
- Note that a couple of small changes can be made to improve the scripts a bit:
- The sidebar pattern can be simplified to:
"\\{\\{[Ss]idebar\\s*(.*?)\\n\\}\\}"
- This can be done because (other than the first letter), template calls are case sensitive.
- The pattern for matching the type of sidebar can also be simplified, as some types have multiple words (i.e. "comic series" vs "novel series"). This can be done by changing it from
"([\\w/]*)\\|(.*)"
to "(.*?)\\|(.*)"
.
- The sidebar pattern can be simplified to:
- I don't see an easy (read simple) way to do a clever loop and create a CSV file for each specific sidebar type that's been found. If that could be done, then I think that'd be a huge win in order to do a lot of data cleanup on the system.