---
title: Rules Study Notebook
---
```{r}
#| label: setup
#| include: false
# load libraries for entire document (not cached)
library(here)
library(arrow)
# load data
server_first_last <- arrow::read_parquet(here("data/rules/plots/server_first_last.parquet"))
date_has_data <- arrow::read_parquet(here("data/rules/plots/date_has_data.parquet"))
plots <- NULL
```
```{r}
#| label: fig-data-timeline
#| fig-cap: "Cumulative counts of servers as their metadata appears in the quantitative data."
#| fig-asp: 0.3
library(here)
library(ggplot2)
library(arrow)
library(dplyr)
library(viridis)
server_first_last <- arrow::read_parquet(here("data/rules/plots/server_first_last.parquet"))
date_has_data <- arrow::read_parquet(here("data/rules/plots/date_has_data.parquet"))
# Process first appearances
server_appearance_dates <- server_first_last |>
rename(date = first_date) |>
count(date)
server_app_counts <- server_appearance_dates |>
arrange(date) |>
mutate(count = cumsum(n))
# Process last appearances
server_last_dates <- server_first_last |>
rename(date = last_date) |>
count(date)
server_last_counts <- server_last_dates |>
arrange(date) |>
mutate(count = cumsum(n))
# Process first rule appearances (excluding NA values)
server_rule_dates <- server_first_last |>
filter(!is.na(first_rule)) |>
rename(date = first_rule) |>
count(date)
server_rule_counts <- server_rule_dates |>
arrange(date) |>
mutate(count = cumsum(n))
# Create the plot
plots$data.timeline <- ggplot() +
# Add first appearances area
geom_area(
data = server_app_counts,
aes(x = date, y = count, fill = "Server First Seen"),
#alpha = 0.6
) +
# Add last appearances area
geom_area(
data = server_last_counts,
aes(x = date, y = count, fill = "Server Last Seen")
) +
# Add first rule appearances area
geom_area(
data = server_rule_counts,
aes(x = date, y = count, fill = "Rules Seen")
) +
# Use viridis color palette
scale_fill_viridis(
discrete = TRUE,
option = "G",
name = "Server Timeline",
alpha = 0.5,
breaks = c("Server First Seen", "Server Last Seen", "Rules Seen")
) +
# Add rug plot for dates without crawls
geom_rug(
data = subset(date_has_data, !crawled),
aes(x = date),
sides = "b",
length = unit(0.03, "npc"),
color = "#000000",
alpha = 0.5
) +
theme_minimal() +
labs(
#title = "Server Timeline: First Appearance, Last Seen, and Rules Added",
#x = "Date",
y = "Cumulative Count"
) +
theme(
#legend.position = "bottom",
axis.title.x=element_blank(),
plot.title = element_text(hjust = 0.5),
panel.grid.minor = element_blank(),
text = element_text(family = "sans")
)
plots$data.timeline
```
```{r}
#| label: fig-rules-pca
library(dplyr)
library(arrow)
library(here)
library(ggplot2)
library(tidyr)
exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |>
filter(redirect | short_lived)
rules_vectors <- arrow::read_parquet(
here::here("data/rules/processed/vectorized_rules_en.parquet"),
col_select = c("server", "text", "vector", "cluster", "representative")
) |>
filter(!(server %in% exclude_servers$server)) |>
dplyr::as_tibble()
hdbscan <- arrow::read_parquet(here::here("data/rules/processed/clusters_hdbscan.parquet"))
rules_vectors_has_clusters <- rules_vectors |>
tidyr::drop_na(cluster) |>
left_join(y = hdbscan, by = "cluster")
# First, convert the list column to a matrix
# Each row will be one vector
vectors_matrix <- do.call(rbind, rules_vectors_has_clusters$vector)
# Perform PCA
pca_result <- prcomp(vectors_matrix, center = TRUE, scale. = TRUE, rank = 2)
# Create dataframe with PCA results and clustering
pca_2d <- data.frame(
PC1 = pca_result$x[,1],
PC2 = pca_result$x[,2],
cluster = rules_vectors_has_clusters$cluster,
text = rules_vectors_has_clusters$text,
label = rules_vectors_has_clusters$label
)
plots$pca.clusters <- ggplot(
(pca_2d |> filter(cluster >= 0) |> filter(cluster < 10) |> mutate(cluster = as.factor(cluster))),
aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.5, size=0.1) +
theme_minimal() +
labs(title = "PCA of Vector Data") +
theme(legend.position="none")
plots$pca.clusters
```
```{r}
#| label: tbl-top-rules
#| tbl-cap: Most common rules on Fediverse servers.
#| cache: false # Cannot cache this due to tinytable/quarto limitations
library(arrow)
library(dplyr)
library(here)
library(tinytable)
library(stringr)
library(tidyr)
# TODO: we are missing string "No racism, sexism, homophobia, transphobia, xenophobia, or casteism": solution
# TODO: Handling of languages / multilingual
exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |>
filter(redirect | short_lived)
rules_vectors <- arrow::read_parquet(
here::here("data/rules/processed/vectorized_rules_en.parquet"),
col_select = c("server", "text", "cluster", "representative")
) |>
filter(!(server %in% exclude_servers$server)) |>
dplyr::as_tibble()
rule_clusters <- rules_vectors |> tidyr::drop_na(cluster)
most_common_rules <- rule_clusters |>
count(text) |>
arrange(desc(n))
top_rules <- inner_join(
most_common_rules,
(rules_vectors |> filter(cluster >= 0) |> distinct(text, cluster)),
by = "text"
) |> arrange(cluster, desc(n)) |>
distinct(cluster, .keep_all = TRUE) |>
select(text, cluster)
plots$top.rules <- inner_join(
top_rules,
(rule_clusters |> count(cluster) |> rename(Count = n)),
by = "cluster") |>
select(text, Count) |>
arrange(desc(Count)) |>
head(10) |>
rename(Rule = text) |>
tinytable::tt(width = c(0.75, 0.25)) |>
tinytable::style_tt(align = "lr")
plots$top.rules
```
```{r}
#| label: save-data
saveRDS(plots, here::here("data/processed/rules_plots.rds"))
```