Define system language and time
Sys.setenv(LANG = "en")
Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"
Load packages
invisible(
lapply(
c('LexisNexisTools', #Nexis data handling
'stringr', 'dplyr', 'broom', 'tidytext', 'tidyr', 'tidyverse', 'scales',
'reshape2','zoo', # df handling
'udpipe', 'spacyr', 'quanteda', #NLP
'EMT', # multinomial test
'stm', #topic modeling
'readxl', 'xlsx', # handling xlsx or csv files
'ggplot2', 'wordcloud', 'ggwordcloud', 'RColorBrewer', 'ggpubr',
'ggnewscale', 'treemap', 'ggh4x' #data visualization
),
require, character.only = T))
options(dplyr.summarise.inform = F) # silence dplyr summarise messages
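First-time users may need to install missing packages. A minimal setup sketch (the package vector is abbreviated here; extend it to the full list above):
# one-time setup: install any packages that are not yet available
pkgs <- c("LexisNexisTools", "udpipe", "spacyr", "quanteda", "EMT", "stm")
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)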
System overview
xfun::session_info(dependencies = FALSE)
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
##
## Locale:
## LC_COLLATE=English_United States.utf8
## LC_CTYPE=English_United States.utf8
## LC_MONETARY=English_United States.utf8
## LC_NUMERIC=C
## LC_TIME=English_United States.1252
##
## time zone: Europe/Zurich
## tzcode source: internal
##
## Package version:
## ggh4x_0.2.8 treemap_2.4-4
## ggnewscale_0.4.9 ggpubr_0.6.0
## ggwordcloud_0.5.0 wordcloud_2.6
## RColorBrewer_1.1-3 xlsx_0.6.5
## readxl_1.4.2 stm_1.3.6
## EMT_1.3 quanteda_3.3.1
## spacyr_1.2.1 udpipe_0.8.11
## zoo_1.8-12 reshape2_1.4.4
## scales_1.2.1 lubridate_1.9.2
## forcats_1.0.0 purrr_1.0.1
## readr_2.1.4 tibble_3.2.1
## ggplot2_3.4.2 tidyverse_2.0.0
## tidyr_1.3.0 tidytext_0.4.1
## broom_1.0.4 dplyr_1.1.2
## stringr_1.5.1 LexisNexisTools_0.3.7
## tidyselect_1.2.0 gridBase_0.4-7
## fastmap_1.1.1 janeaustenr_1.0.0
## promises_1.2.0.1 digest_0.6.31
## mime_0.12 timechange_0.2.0
## lifecycle_1.0.4 ellipsis_0.3.2
## tokenizers_0.3.0 magrittr_2.0.3
## compiler_4.3.2 rlang_1.1.1
## sass_0.4.6 tools_4.3.2
## igraph_1.5.0 utf8_1.2.3
## yaml_2.3.7 data.table_1.14.8
## knitr_1.43 ggsignif_0.6.4
## stopwords_2.3 plyr_1.8.8
## abind_1.4-5 withr_3.0.0
## grid_4.3.2 fansi_1.0.4
## xtable_1.8-4 colorspace_2.1-0
## cli_3.6.1 rmarkdown_2.22
## generics_0.1.3 RcppParallel_5.1.7
## stringdist_0.9.12 rstudioapi_0.14
## tzdb_0.4.0 pbapply_1.7-2
## cachem_1.0.8 parallel_4.3.2
## cellranger_1.1.0 vctrs_0.6.2
## Matrix_1.6-0 jsonlite_1.8.5
## carData_3.0-5 car_3.1-2
## hms_1.1.3 rstatix_0.7.2
## jquerylib_0.1.4 quanteda.textstats_0.96.2
## glue_1.6.2 stringi_1.7.12
## rJava_1.0-6 gtable_0.3.3
## later_1.3.1 munsell_0.5.0
## pillar_1.9.0 xlsxjars_0.6.1
## htmltools_0.5.5 R6_2.5.1
## shiny_1.7.4 evaluate_0.21
## lattice_0.21-9 png_0.1-8
## backports_1.4.1 SnowballC_0.7.1
## httpuv_1.6.11 bslib_0.4.2
## Rcpp_1.0.10 fastmatch_1.1-3
## nsyllable_1.0.1 xfun_0.39
## pkgconfig_2.0.3
Data were downloaded from https://advance.lexis.com. For the detailed search query, please refer to the paper.
# import data; this step may prompt you to select a language
LNToutput <- lnt_read(x = paste0("./ScrapedData/Nexis"))
# convert raw data into paragraphs
meta_paragraphs_df <- lnt_convert(LNToutput, to = "data.frame", what = "Paragraphs")
doclist <- unique(meta_paragraphs_df$Art_ID)
classification_indices <- which(meta_paragraphs_df$Paragraph == 'Classification')
# restructure the data
metadata <- data.frame()
bodytext <- data.frame()
for (i in seq_along(doclist)) {
# targeting a single article
subset <- meta_paragraphs_df %>% filter(Art_ID == doclist[i])
# selecting and restructuring metadata
marker <- classification_indices[i]
start_row <- which(subset$Par_ID == marker)
temp <- subset[start_row:nrow(subset), ]
temp <- temp %>% select(Art_ID, Par_ID, Paragraph)
metadata <- bind_rows(metadata, temp)
# selecting and restructuring text sentences
temp1 <- subset[1:(start_row - 1), ] # rows before the 'Classification' marker
bodytext <- bind_rows(bodytext, temp1)
rm(i, subset, marker, start_row, temp, temp1)
}
# fine-structuring the metadata
metadata$cate <- sapply(strsplit(metadata$Paragraph, ":"), function(x) x[1])
metadata$content <- sapply(strsplit(metadata$Paragraph, ":"), function(x) x[2]) %>%
trimws()
metadata <- metadata %>% select(Art_ID, cate, content) %>% drop_na()
metadata <- spread(metadata, key = cate, value = content)
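Note that spread() is superseded in recent tidyr; an equivalent call with pivot_wider(), assuming one content value per Art_ID/cate pair, is shown commented below.
# equivalent modern interface (not run; spread() above already reshaped the data)
# metadata <- pivot_wider(metadata, names_from = cate, values_from = content)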
# fine-filtering the bodytext
bodytext <- bodytext %>%
filter(Paragraph != "Graphic" & Paragraph != "" & Paragraph != " ")
extra <- bodytext %>% select(-Par_ID, -Paragraph) %>% unique()
# combine text and metadata
Nexisdata_df <- bodytext %>% group_by(Art_ID) %>%
summarize(Sentence = paste(Paragraph, collapse = " "))
Nexisdata_df <- left_join(Nexisdata_df, extra, by = "Art_ID")
Nexisdata_df <- left_join(Nexisdata_df, metadata, by = "Art_ID")
Nexisdata_df <- Nexisdata_df %>% filter(!endsWith(Source_File, "doclist.docx"))
Nexisdata_df <- Nexisdata_df %>%
select("Art_ID", "Headline", "Sentence", "Newspaper", "Date",
"Length", "Language","Subject", "Geographic", "Company",
"Industry", "Ticker", "Journal Code", "Section", "Edition",
"Author", "Graphic", "Publication-Type", "Source_File")
# load the archived raw data
Nexisdata_df <- readRDS("./ScrapedData/Nexisdata_all_raw.rds")
paste0("Number of articles from Nexis Uni: ", nrow(Nexisdata_df))
## [1] "Number of articles from Nexis Uni: 1361"
Filtering process 1
# remove duplicates in headlines
Nexisdata_df0 <- Nexisdata_df[!duplicated(Nexisdata_df[c("Headline")]), ]
# identify primary subject & region: take the first item and drop the
# parenthetical relevance score
Nexisdata_df0$mainsubject <- sapply(strsplit(Nexisdata_df0$Subject, ";"), function(x) x[1])
Nexisdata_df0$mainsubject <- gsub("\\(.*", "", Nexisdata_df0$mainsubject)
Nexisdata_df0$mainregion <- sapply(strsplit(Nexisdata_df0$Geographic, ";"), function(x) x[1])
Nexisdata_df0$mainregion <- gsub("\\(.*", "", Nexisdata_df0$mainregion)
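A quick illustration of the two-step extraction on a hypothetical Subject string:
# hypothetical example: keep the first item, then drop the relevance score
s <- "DROUGHT (91%); RAINFALL (78%)"
sapply(strsplit(s, ";"), function(x) x[1]) # "DROUGHT (91%)"
gsub("\\(.*", "", "DROUGHT (91%)") # "DROUGHT " (note the trailing space)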
# assign new order
Nexisdata_df0$order <- c(1:nrow(Nexisdata_df0))
# reorder the columns
Nexisdata_df0 <- Nexisdata_df0 %>%
select("order","Art_ID", "mainsubject", "mainregion", everything())
# Remove irrelevant sections (this metadata is provided by the news publishers)
Nexisdata_df00 <- subset(Nexisdata_df0,
!grepl(paste(c("Sport", "SPORT"),
collapse = "|"), Section))
Nexisdata_df00 <- subset(Nexisdata_df00,
!grepl("SOCCER|CRICKET",mainsubject, ignore.case = TRUE))
# Remove irrelevant regions (this metadata is auto-generated by Nexis Uni)
Nexisdata_df00 <- subset(Nexisdata_df00,
grepl(paste(c("UNITED KINGDOM", "ENGLAND"),
collapse = "|"), mainregion, ignore.case = TRUE))
Nexisdata_df00 <- subset(Nexisdata_df00,
!grepl("NEW SOUTH WALES", mainregion, ignore.case = TRUE))
#Remove rows with no valid date metadata
Nexisdata_df00 <- Nexisdata_df00 %>% drop_na(Date)
Filtering process 2: Broadsheets, Tabloids, Printed, Online
monthly_count_Nexis_summary <- table(Nexisdata_df00$Newspaper) %>%
as.data.frame() %>% arrange(desc(Freq))
monthly_count_Nexis_summary <- monthly_count_Nexis_summary %>%
mutate(Tabloid = ifelse(str_detect(tolower(Var1),
'telegraph|independent|times|guardian|financial'), 0,
ifelse(str_detect(tolower(Var1), 'mail|mirror|sun'), 1, 9)))
monthly_count_Nexis_summary <- monthly_count_Nexis_summary %>% subset(!Tabloid == 9)
monthly_count_Nexis_summary <- monthly_count_Nexis_summary %>%
mutate(Online = ifelse(str_detect(tolower(Var1), '\\.co\\.uk|\\.com|online'), 1, 0)) # dots escaped so ".com" matches literally
Nexisdata_df000 <- Nexisdata_df00 %>%
separate(col = Date, into = c("year", "month", "daytime"), sep = "-")
Nexisdata_df000[, c("year","month")] <- sapply(Nexisdata_df000[, c("year","month")],
as.numeric)
Nexisdata_df1 <- Nexisdata_df000 %>%
dplyr::filter(Newspaper %in% monthly_count_Nexis_summary$Var1)
paste0("Number of valid articles after filtering: ", nrow(Nexisdata_df1))
## [1] "Number of valid articles after filtering: 836"
Monthly article counts by year
data3_Nexis <- Nexisdata_df1 %>% group_by(year, month) %>% summarize(count = n())
monthly_count_Nexis <- data.frame(year = rep(2000:2023, each = 12), month = rep(1:12, 24))
monthly_count_Nexis <- left_join(monthly_count_Nexis, data3_Nexis, by = c("year", "month"))
monthly_count_Nexis[is.na(monthly_count_Nexis)] = 0
monthly_count_Nexis[, c("year", "month")] <- sapply(monthly_count_Nexis[, c("year", "month")], as.character)
monthly_count_Nexis <- monthly_count_Nexis %>%
mutate(ym = paste0(year, "-", month))
monthly_count_Nexis %>%
mutate(month = fct_relevel(month, unique(monthly_count_Nexis$month))) %>%
ggplot(aes(x=month, y=count)) +
geom_bar(stat='identity', fill="forestgreen") +
labs(title = "Number of articles about drought") +
theme_light()+
facet_wrap(~year, ncol=8) +
theme(strip.background =element_rect(fill="white")) +
theme(strip.text = element_text(colour = 'black'))
# Mean Central England Temperature (MCET)
#https://www.metoffice.gov.uk/hadobs/hadcet/data/meantemp_monthly_totals.txt
List_UK_temp <- read.xlsx(paste0("./ScrapedData/MET/Longterm_TP.xlsx"), sheetName = "MCET")
# Monthly England and Wales precipitation (MEWP)
#https://www.metoffice.gov.uk/hadobs/hadukp/data/monthly/HadEWP_monthly_totals.txt
List_UK_prcp <- read.xlsx(paste0("./ScrapedData/MET/Longterm_TP.xlsx"), sheetName = "MEWP")
# calculate anomaly based on 1991-2020 average
List_UK_temp_subset_ref <- List_UK_temp %>% subset(Year %in% c(1991:2020)) %>%
select(-Year, -Annual)
UK_temp_ref_mean <- colMeans(List_UK_temp_subset_ref)
List_UK_prcp_subset_ref <- List_UK_prcp %>% subset(Year %in% c(1991:2020)) %>%
select(-Year, -Annual)
UK_prcp_ref_mean <- colMeans(List_UK_prcp_subset_ref)
# restructure data
List_UK_temp_subset_target <- List_UK_temp %>% subset(Year %in% c(2000:2023)) %>%
select(-Annual)
temp_target_anomaly <- sweep(List_UK_temp_subset_target[2:13], 2, UK_temp_ref_mean, "-")
temp_target_anomaly$Year <- List_UK_temp_subset_target$Year
temp_target_anomaly_long <- temp_target_anomaly %>%
pivot_longer(cols = -Year, names_to = "Month", values_to = "Value")
temp_target_anomaly_long$month <- rep(1:12, length(2000:2023))
temp_target_anomaly_long <- temp_target_anomaly_long %>% subset(Year %in% c(2000:2023))
List_UK_prcp_subset_target <- List_UK_prcp %>% subset(Year %in% c(2000:2023)) %>%
select(-Annual)
prcp_target_anomaly <- sweep(List_UK_prcp_subset_target[2:13], 2, UK_prcp_ref_mean, "-")
prcp_target_anomaly$Year <- List_UK_prcp_subset_target$Year
prcp_target_anomaly_long <- prcp_target_anomaly %>%
pivot_longer(cols = -Year, names_to = "Month", values_to = "Value")
prcp_target_anomaly_long$month <- rep(1:12, length(2000:2023))
prcp_target_anomaly_long <- prcp_target_anomaly_long %>% subset(Year %in% c(2000:2023))
# add temp and prcp to the Nexis counts (positional assignment: both frames
# are ordered by year, then month)
monthly_count_Nexis$temp_anomaly <- temp_target_anomaly_long$Value
monthly_count_Nexis$prcp_anomaly <- prcp_target_anomaly_long$Value
# remove the months 9, 10, 11, 12 of 2023 (the last four rows)
monthly_count_Nexis <- head(monthly_count_Nexis, n = nrow(monthly_count_Nexis) - 4)
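An equivalent, more explicit filter (year and month are stored as character at this point, hence the conversion; the _alt name is illustrative only):
# explicit alternative to dropping the last four rows
monthly_count_Nexis_alt <- monthly_count_Nexis %>%
dplyr::filter(!(as.numeric(year) == 2023 & as.numeric(month) >= 9))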
# create new values considering temporal lags
monthly_count_Nexis$temp_2mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly,width = 2,
FUN = mean, fill = 0, align = "right")
monthly_count_Nexis$temp_3mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly,width = 3,
FUN = mean, fill = 0, align = "right")
monthly_count_Nexis$temp_3sum <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 3,
FUN = sum, fill = 0, align = "right")
monthly_count_Nexis$temp_4mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 4,
FUN = mean, fill = 0, align = "right")
monthly_count_Nexis$temp_5mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 5,
FUN = mean, fill = 0, align = "right")
monthly_count_Nexis$temp_6mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 6,
FUN = mean, fill = 0, align = "right")
monthly_count_Nexis$temp_12mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 12,
FUN = mean, fill = 0, align = "right")
monthly_count_Nexis$prcp_2sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly,width = 2,
FUN = sum, fill = 0, align = "right")
monthly_count_Nexis$prcp_3mean <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 3,
FUN = mean, fill = 0, align = "right")
monthly_count_Nexis$prcp_3sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 3,
FUN = sum, fill = 0, align = "right")
monthly_count_Nexis$prcp_4sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 4,
FUN = sum, fill = 0, align = "right")
monthly_count_Nexis$prcp_5sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 5,
FUN = sum, fill = 0, align = "right")
monthly_count_Nexis$prcp_6sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 6,
FUN = sum, fill = 0, align = "right")
monthly_count_Nexis$prcp_12sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 12,
FUN = sum, fill = 0, align = "right")
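A toy illustration of the right-aligned rolling window used above (hypothetical values):
# each output value summarizes the current month and the preceding months;
# positions with an incomplete window are filled with 0
zoo::rollapply(c(1, 2, 3, 4, 5), width = 3, FUN = mean, fill = 0, align = "right")
# returns 0 0 2 3 4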
monthly_count_Nexis %>%
mutate(zero = ifelse(count == 0, 0, 9)) %>%
ggplot(aes(x=temp_6mean, y=prcp_6sum)) +
geom_point(color = "black", stroke = 0.2, shape = 21,
aes(fill = factor(zero), size = count, alpha = count)) +
scale_fill_manual(values = c("white","darkblue")) +
scale_size_continuous(
range = c(0, 10)
) +
labs(
x = "Temp. Anomaly: 6month",
y = "Prcp. Anomaly: 6month",
title = paste0("Correlation plot: count of articles, n = ",
sum(monthly_count_Nexis$count))
) +
guides(fill = "none",
count = "none",
size = guide_legend(title = waiver())) +
theme_bw() +
geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
xlim(-3.5, 3.5) + ylim(-250, 250)
# H0: the categorical variable follows the hypothesized distribution.
# HA: the categorical variable does not follow the hypothesized distribution.
# When the p-value is below the significance level (e.g. α = .05),
# we reject the null hypothesis.
temp_default_qd1 <- monthly_count_Nexis %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd2 <- monthly_count_Nexis %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd3 <- monthly_count_Nexis %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd4 <- monthly_count_Nexis %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd <- c(length(unique(temp_default_qd1$ym)),
length(unique(temp_default_qd2$ym)),
length(unique(temp_default_qd3$ym)),
length(unique(temp_default_qd4$ym)))
temp_default_sum <- sum(temp_default_qd)
temp_default_qd <- (temp_default_qd)/temp_default_sum
temp_default_qd
## [1] 0.2616487 0.2007168 0.1648746 0.3727599
temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count),
sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1] 69 42 82 643
chisq.test(temp_observed_qd, p=temp_default_qd)
##
## Chi-squared test for given probabilities
##
## data: temp_observed_qd
## X-squared = 571.8, df = 3, p-value < 2.2e-16
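As a cross-check, the EMT package loaded above offers a multinomial test of the same hypothesis; with n = 836 an exact enumeration is costly, so a Monte Carlo approximation is assumed here:
# optional cross-check; MonteCarlo = TRUE keeps the runtime manageable
EMT::multinomial.test(temp_observed_qd, prob = temp_default_qd, MonteCarlo = TRUE)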
1-month window
monthly_count_Nexis %>%
mutate(zero = ifelse(count == 0, 0, 9)) %>%
ggplot(aes(x=temp_anomaly, y=prcp_anomaly)) +
geom_point(color = "black", stroke = 0.2, shape = 21,
aes(fill = factor(zero), size = count, alpha = count)) +
scale_fill_manual(values = c("white","darkblue")) +
scale_size_continuous(
range = c(0, 10)
) +
labs(
x = "Temp. Anomaly: 1month",
y = "Prcp. Anomaly: 1month",
title = paste0("Correlation plot: count of articles, n = ",
sum(monthly_count_Nexis$count))
) +
guides(fill = "none",
count = "none",
size = guide_legend(title = waiver())) +
theme_bw() +
geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
xlim(-3.5, 3.5) + ylim(-250, 250)
3-month window
monthly_count_Nexis %>%
mutate(zero = ifelse(count == 0, 0, 9)) %>%
ggplot(aes(x=temp_3mean, y=prcp_3sum)) +
geom_point(color = "black", stroke = 0.2, shape = 21,
aes(fill = factor(zero), size = count, alpha = count)) +
scale_fill_manual(values = c("white","darkblue")) +
scale_size_continuous(
range = c(0, 10)
) +
labs(
x = "Temp. Anomaly: 3month",
y = "Prcp. Anomaly: 3month",
title = paste0("Correlation plot: count of articles, n = ",
sum(monthly_count_Nexis$count))
) +
guides(fill = "none",
count = "none",
size = guide_legend(title = waiver())) +
theme_bw() +
geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
xlim(-3.5, 3.5) + ylim(-250, 250)
12-month window
monthly_count_Nexis %>%
mutate(zero = ifelse(count == 0, 0, 9)) %>%
ggplot(aes(x=temp_12mean, y=prcp_12sum)) +
geom_point(color = "black", stroke = 0.2, shape = 21,
aes(fill = factor(zero), size = count, alpha = count)) +
scale_fill_manual(values = c("white","darkblue")) +
scale_size_continuous(
range = c(0, 10)
) +
labs(
x = "Temp. Anomaly: 12month",
y = "Prcp. Anomaly: 12month",
title = paste0("Correlation plot: count of articles, n = ",
sum(monthly_count_Nexis$count))
) +
guides(fill = "none",
count = "none",
size = guide_legend(title = waiver())) +
theme_bw() +
geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
xlim(-3.5, 3.5) + ylim(-250, 250)
monthly_count_Nexis <- monthly_count_Nexis %>%
mutate(season = if_else(month %in% c(9, 10, 11), "SON",
if_else(month %in% c(12, 1,2), "DJF",
if_else(month %in% c(3,4,5), "MAM",
if_else(month %in% c(6,7,8), "JJA", NA)))))
monthly_count_Nexis$season <- factor(monthly_count_Nexis$season,
levels=c("SON", "DJF", "MAM","JJA"))
monthly_count_Nexis %>%
mutate(zero = ifelse(count == 0, 0, 9)) %>%
ggplot(aes(x=temp_6mean, y=prcp_6sum, group=season)) +
geom_point(color = "black", stroke = 0.2, shape = 21,
aes(fill = factor(zero), size = count, alpha = count)) +
scale_fill_manual(values = c("white","darkblue")) +
labs(
x = "Temp. Anomaly: 6month",
y = "Prcp. Anomaly: 6month",
title = "Correlation plot: count of articles by seasonality",
subtitle = paste0("SON: ", sum(subset(monthly_count_Nexis, season == "SON")$count), ", ",
"DJF: ", sum(subset(monthly_count_Nexis, season == "DJF")$count), ", ",
"MAM: ", sum(subset(monthly_count_Nexis, season == "MAM")$count), ", ",
"JJA: ", sum(subset(monthly_count_Nexis, season == "JJA")$count))
) +
scale_size_continuous(
name = "count",
range = c(1, 10),
guide = guide_legend(
title.position = "top",
label.position = "right")
) +
facet_wrap(~season) +
guides(color = "none",
fill = "none",
size = guide_legend(title = waiver())) +
theme_bw() +
geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.3) +
geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.3) +
xlim(-3.5, 3.5) + ylim(-250, 250)
SON
target_season <- "SON"
temp_default_qd1 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd2 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd3 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd4 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd <- c(length(unique(temp_default_qd1$ym)),
length(unique(temp_default_qd2$ym)),
length(unique(temp_default_qd3$ym)),
length(unique(temp_default_qd4$ym)))
temp_default_qd
## [1] 12 17 10 30
temp_default_sum <- sum(temp_default_qd)
temp_default_qd <- (temp_default_qd)/temp_default_sum
temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count),
sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1] 5 5 5 40
chisq.test(temp_observed_qd, p=temp_default_qd)
##
## Chi-squared test for given probabilities
##
## data: temp_observed_qd
## X-squared = 19.504, df = 3, p-value = 0.000215
DJF
target_season <- "DJF"
temp_default_qd1 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd2 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd3 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd4 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd <- c(length(unique(temp_default_qd1$ym)),
length(unique(temp_default_qd2$ym)),
length(unique(temp_default_qd3$ym)),
length(unique(temp_default_qd4$ym)))
temp_default_qd
## [1] 19 12 10 28
temp_default_sum <- sum(temp_default_qd)
temp_default_qd <- (temp_default_qd)/temp_default_sum
temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count),
sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1] 14 9 4 88
chisq.test(temp_observed_qd, p=temp_default_qd)
##
## Chi-squared test for given probabilities
##
## data: temp_observed_qd
## X-squared = 62.142, df = 3, p-value = 2.048e-13
MAM
target_season <- "MAM"
temp_default_qd1 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd2 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd3 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd4 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd <- c(length(unique(temp_default_qd1$ym)),
length(unique(temp_default_qd2$ym)),
length(unique(temp_default_qd3$ym)),
length(unique(temp_default_qd4$ym)))
temp_default_qd
## [1] 22 12 13 22
temp_default_sum <- sum(temp_default_qd)
temp_default_qd <- (temp_default_qd)/temp_default_sum
temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count),
sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1] 30 11 48 206
chisq.test(temp_observed_qd, p=temp_default_qd)
##
## Chi-squared test for given probabilities
##
## data: temp_observed_qd
## X-squared = 209.55, df = 3, p-value < 2.2e-16
JJA
target_season <- "JJA"
temp_default_qd1 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd2 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 )
temp_default_qd3 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd4 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 )
temp_default_qd <- c(length(unique(temp_default_qd1$ym)),
length(unique(temp_default_qd2$ym)),
length(unique(temp_default_qd3$ym)),
length(unique(temp_default_qd4$ym)))
temp_default_qd
## [1] 20 15 13 24
temp_default_sum <- sum(temp_default_qd)
temp_default_qd <- (temp_default_qd)/temp_default_sum
temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count),
sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1] 20 17 25 309
chisq.test(temp_observed_qd, p=temp_default_qd)
##
## Chi-squared test for given probabilities
##
## data: temp_observed_qd
## X-squared = 418.03, df = 3, p-value < 2.2e-16
Archive and remove data
rm(target_season, temp_default_qd1, temp_default_qd2, temp_default_qd3, temp_default_qd4,
temp_default_qd, temp_default_sum)
write.xlsx(monthly_count_Nexis,
file = paste0("./ProcessedData/Nexisdata_summary.xlsx"))
winsorize <- function(x, p = c(0.01, 0.99)) {
q <- quantile(x, probs = p, na.rm = TRUE)
x[x < q[1]] <- q[1]
x[x > q[2]] <- q[2]
x
}
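A quick check of the helper on a toy vector:
# toy usage: extreme values are clamped to the 1st/99th percentiles
winsorize(c(-100, 1:10, 1000)) # -100 and 1000 are pulled inward; the rest pass through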
monthly_count_Nexis$ym <- as.yearmon(monthly_count_Nexis$ym)
monthly_count_Nexis$ym <- as.character(monthly_count_Nexis$ym)
monthly_count_Nexis$year <- as.numeric(monthly_count_Nexis$year)
monthly_count_Nexis$month <- as.numeric(monthly_count_Nexis$month)
monthly_count_Nexis_0 <- data.frame("ym" = monthly_count_Nexis$ym,
"year" = monthly_count_Nexis$year,
"month" = monthly_count_Nexis$month,
"count" = monthly_count_Nexis$count,
"source" = "count.article")
monthly_count_Nexis_0$count_rescale2 <- rescale(winsorize(monthly_count_Nexis_0$count))
# 6-month mean temp
monthly_count_Nexis_T <- data.frame("ym" = monthly_count_Nexis$ym,
"year" = monthly_count_Nexis$year,
"month" = monthly_count_Nexis$month,
"count" = round(monthly_count_Nexis$temp_6mean, digits = 1),
"source" = "temp.6mo.avg")
monthly_count_Nexis_T$count_rescale2 <- rescale(monthly_count_Nexis_T$count)
# 6-month sum prcp
monthly_count_Nexis_P <- data.frame("ym" = monthly_count_Nexis$ym,
"year" = monthly_count_Nexis$year,
"month" = monthly_count_Nexis$month,
"count" = round(monthly_count_Nexis$prcp_6sum, digits = 0),
"source" = "prcp.6mo.sum")
monthly_count_Nexis_P$count_rescale2 <- rescale(monthly_count_Nexis_P$count)
temp_stat <- cor.test(monthly_count_Nexis$count, round(monthly_count_Nexis$temp_6mean, 3))
paste0("Temperature in 6-month window: coefficient ", round(temp_stat$estimate,3),
", p-value ", round(temp_stat$p.value, 3))
## [1] "Temperature in 6-month window: coefficient 0.146, p-value 0.014"
prcp_stat <- cor.test(monthly_count_Nexis$count, round(monthly_count_Nexis$prcp_6sum, 3))
paste0("Precipitation in 6-month window: coefficient ", round(prcp_stat$estimate,3),
", p-value ", round(prcp_stat$p.value, 3))
## [1] "Precipitation in 6-month window: coefficient -0.206, p-value 0"
Plot the anomaly trend
plot_weather2 <- monthly_count_Nexis %>%
mutate(ym = factor(ym, levels=ym)) %>%
mutate(temp_6mean_zscore = as.vector(scale(temp_6mean))) %>%
mutate(prcp_6sum_zscore = as.vector(scale(prcp_6sum))) %>%
mutate(order = 1:n()) %>%
mutate(both_conditions_met = as.vector(temp_6mean_zscore > 0 & prcp_6sum_zscore < 0)) %>%
ggplot(aes(x = order)) +
geom_line(aes(y = temp_6mean_zscore, color = "temp_6mean_zscore"), group = 1) +
geom_line(aes(y = prcp_6sum_zscore, color = "prcp_6sum_zscore"), group = 2) +
scale_color_manual(values = c("#2c7bb6", "#d7191c"),
labels = c("Prcp anomaly: 6 month sum", "Temp anomaly: 6 month average")) +
labs(x = " ", y = "Z-score") +
geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
ylim(-4, 4) +
theme_bw() +
# Apply conditional coloring
geom_rect(aes(xmin = order - 0.5, xmax = order + 0.5, ymin = -Inf, ymax = Inf,
fill = both_conditions_met), alpha = 0.5) +
scale_fill_manual(values = c(`TRUE` = "gold", `FALSE` = "white"), guide = "none") +
scale_x_continuous(
expand = c(0,0),
breaks = seq(1, nrow(monthly_count_Nexis), by = 12),
labels = monthly_count_Nexis$ym[seq(1, nrow(monthly_count_Nexis), by = 12)]
) +
guides(color = guide_legend(
title = NULL,
title.position = "top",
keywidth = 1,
keyheight = 1
)) +
theme(
panel.background = element_blank(),
panel.border = element_blank(),
axis.text.x = element_text(angle = 30, vjust = 0.5, hjust=0.5),
legend.background = element_blank(),
legend.title = element_blank(),
plot.background = element_blank()
)
plot_media2 <- monthly_count_Nexis %>%
mutate(ym = factor(ym, levels=ym)) %>%
ggplot(aes(x = ym, y = count, fill = "count")) +
geom_bar(stat = "identity") +
scale_x_discrete(breaks = function(x) x[seq(1, length(x), by = 12)]) +
scale_fill_manual(values = "darkblue",
labels = "Article count: 1 month") +
theme_bw() +
labs(
x = " ",
y = "Count") +
theme(axis.text.x = element_blank(),
legend.title = element_blank(),
legend.background = element_blank())
ggpubr::ggarrange(plot_media2, plot_weather2,
ncol = 1, nrow = 2, font.label = list(size = 12),
align = "v")
ggplot() +
# temp6mean
geom_point(
data = monthly_count_Nexis_T,
aes(month, source, color=count, size = 6)) +
scale_color_gradient2(low = "white", mid = "white", high = "darkorange") +
geom_text(data = monthly_count_Nexis_T,
aes(month, source, label = count),
color = "black", size = 2) +
# prcp6sum
new_scale_color() +
geom_point(
data = monthly_count_Nexis_P,
aes(month, source, color=count, size = 6)) +
scale_color_gradient2(low = "brown", mid = "white", high = "white") +
geom_text(data = monthly_count_Nexis_P,
aes(month, source, label = count),
color = "black", size = 2) +
# article count
new_scale_fill() +
geom_tile(
data = monthly_count_Nexis_0,
aes(month, source, fill=count_rescale2)
) +
scale_fill_gradient(low = "white", high = "darkblue") +
geom_text(data = monthly_count_Nexis_0,
aes(month, source, label = count),
color = ifelse(monthly_count_Nexis_0$count > 30,
"white", "black"), size = 2) +
facet_wrap(~year) +
theme_minimal() +
labs(x = " ", y = " ", fill = " ") +
scale_x_continuous(breaks = 1:12, labels = 1:12, # explicit labels (bare `month` relied on lubridate's month() passing numerics through)
position = "top",
expand = c(0,0)) +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.placement = "outside",
panel.spacing.y = unit(0.5, "lines"),
panel.spacing.x = unit(0.5, "lines"),
strip.text.x = element_text(size = 8, face = "bold"),
axis.text.x = element_text(size = 6, face = "bold.italic")
)
The STM workflow follows the tutorial by Roberts et al. (2019; DOI: https://doi.org/10.18637/jss.v091.i02).
Subset data for target year
reference <- data.frame("year" = rep(c(2012, 2022), each = 6),
"month" = c(1,2,3,4,5,6, # for 2012
7,8,9,10,11,12)) # for 2022
datarange <- list("2012" = "Jan to Jun, 2012",
"2022" = "July to Dec, 2022")
data_input <- Nexisdata_df1
Initiate the NLP model
#spacy_download_langmodel(model = "en_core_web_trf")
spacy_initialize(model = "en_core_web_trf")
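If no spaCy backend exists yet, spacyr can install one; a one-time setup sketch (commented out because it only needs to run once):
# spacyr::spacy_install() # installs spaCy into a dedicated Python environment
# spacyr::spacy_download_langmodel(model = "en_core_web_trf")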
Proceed with NLP
# NLP. target text includes title, subtitle, and body text
spacy_0 <- spacy_parse(paste0(data_input$Headline, ". ", data_input$Sentence),
dependency = T, nounphrase = TRUE)
# extract named entities
data_entity <- entity_extract(spacy_0, type = "named", concatenator = "_")
data_entity$ngram <- sapply(strsplit(data_entity$entity, "_"), length)
data_entity_consolidate <- udpipe::txt_recode_ngram(spacy_0$token,
compound = data_entity$entity,
ngram = data_entity$ngram,
sep = "_") %>% as.data.frame()
colnames(data_entity_consolidate) <- "entity_result"
spacy_11 <- cbind(spacy_0, data_entity_consolidate)
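A toy illustration of the consolidation step (hypothetical tokens): txt_recode_ngram() writes the compound over the first token of the n-gram and sets the remaining tokens to NA.
# "Environment Agency" becomes one underscore-joined token; "Agency" becomes NA
udpipe::txt_recode_ngram(c("the", "Environment", "Agency", "said"),
compound = "Environment_Agency", ngram = 2, sep = "_")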
# consolidate selected NERs; we keep "named" entities and drop numeric/temporal types
spacy_11$entity[spacy_11$entity %in%
c("DATE_B", "DATE_I", "LANGUAGE_B", "LANGUAGE_I", "QUANTITY_I",
"QUANTITY_B", "CARDINAL_I", "CARDINAL_B", "ORDINAL_B",
"ORDINAL_I", "TIME_B", "TIME_I", "MONEY_I", "MONEY_B",
"PERCENT_B", "PERCENT_I")] <- ""
spacy_11$final <- with(spacy_11, ifelse(entity == "", lemma, entity_result))
spacy_11 <- spacy_11 %>% drop_na()
# assign a new document-sentence ID
spacy_11$docsen <- paste0(spacy_11$doc_id, "_", spacy_11$sentence_id)
# stop-word removal; this lexicon matches the snowball list from the 'stopwords' R package
sw <- quanteda::stopwords("english") %>% data.frame()
colnames(sw) <- "final"
spacy_2 <- spacy_11 %>% dplyr::anti_join(sw, by = "final")
# more data cleaning
spacy_2 <- filter(spacy_2, spacy_2$pos != "PUNCT") # remove punctuation
spacy_2 <- filter(spacy_2, nchar(spacy_2$final) != 1) # remove single-character tokens
spacy_2 <- spacy_2[!(spacy_2$final %in%
c("I", "’", "-", "•", "'", "/", "’s", "'s", "’re", "’ve", "n't")),]
# save the dataframes
saveRDS(spacy_11, paste0("./ProcessedData/N_drought+england_ALL_spacy1.rds"))
saveRDS(spacy_2, paste0("./ProcessedData/N_drought+england_ALL_spacy2.rds"))
spacy_2 <- readRDS(paste0("./ProcessedData/N_drought+england_ALL_spacy2.rds"))
# filter nouns and return tokens into sentences
x <- spacy_2 %>%
filter(pos %in% c("NOUN")) %>%
group_by(doc_id) %>%
mutate(text_string = paste(final, collapse = " ")) %>%
select(doc_id, text_string) %>% unique()
# label the source of each news article: broadsheet or tabloid?
temp <- data_input %>% select(Art_ID, year, month, Newspaper)
temp <- temp %>%
mutate(source = ifelse(str_detect(tolower(Newspaper),
'telegraph|independent|times|guardian|financial'),
"broadsheet",
ifelse(str_detect(tolower(Newspaper),
'mail|mirror|sun'), "tabloid", "NA")))
temp$doc_id <- paste0("text", c(1:nrow(temp)))
# merge two above dataset
x <- left_join(x, temp, by = "doc_id")
# tokenize text strings
x_token <- x$text_string %>%
tokens(what = "fastestword") # this line works the same as: stri_split_fixed(x$text_string, " ")
# create a data frame optimal for topic modeling
x_dfm <- dfm(x_token, tolower = F)
x_stm <- convert(x_dfm, to = "stm")
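An optional sanity check of the document-feature matrix before fitting (not part of the original pipeline):
# inspect the most frequent features going into the topic model
quanteda::topfeatures(x_dfm, 10)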
This part is optional; the final plot may give different results on every run. To learn more, try ?searchK()
# give the k to test the optimal number of topics
K1 <- c(4, 6, 8, 10, 12, 15, 20, 30)
# calculate semantic coherence, exclusivity, heldout, and residual
set.seed(123)
x_fit1 <- searchK(x_stm$documents, x_stm$vocab, K = K1, verbose = F)
x_plot1 <- data.frame("K" = K1,
"Coherence" = unlist(x_fit1$results$semcoh),
"Exclusivity" = unlist(x_fit1$results$exclus),
"heldout" = unlist(x_fit1$results$heldout),
"residual" = unlist(x_fit1$results$residual))
# plot the result
x_plot1 <- reshape2::melt(x_plot1, id = c("K"))
ggplot(x_plot1, aes(K, value, color = variable, group = 1)) +
geom_point(show.legend = FALSE) +
geom_line(show.legend = FALSE) + #size = 0.5,
scale_x_continuous(breaks = unique(x_plot1$K)) +
facet_wrap(~variable, scales = "free_y") +
theme_bw() +
labs(x = "Number of topics K",
title = "Statistical fit of models with different K")
optimalK <- 10
set.seed(123)
x_model <- stm(documents = x_stm$documents,
vocab = x_stm$vocab,
K = as.numeric(optimalK),
data = x,
verbose = F)
plot.STM(x_model, type = "summary")
Filter keywords per topic
# filter top 50 words
x_topic <- labelTopics(x_model, n = 50)
x_topic_prob <- data.frame("features" = t(x_topic$prob))
colnames(x_topic_prob) <- paste("Topic", c(1:as.numeric(optimalK)))
x_topic_frex <- data.frame("features" = t(x_topic$frex))
colnames(x_topic_frex) <- paste("Topic", c(1:as.numeric(optimalK)))
# example of probability-based keywords
x_topic_prob %>% head(10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7
## 1 water farmer water water water rain year
## 2 drought crop plant year company temperature drought
## 3 river year garden climate year week time
## 4 level food tree reservoir drought weather thing
## 5 rainfall price drought river supply day day
## 6 company drought lawn change day flood school
## 7 area weather summer drought litre water people
## 8 ban condition ban government leak drought rain
## 9 hosepipe potato hosepipe wildlife order flooding goal
## 10 year vegetable butt plan use area garden
## Topic 8 Topic 9 Topic 10
## 1 water year rain
## 2 ban cent year
## 3 hosepipe per rainfall
## 4 customer home month
## 5 drought drought weather
## 6 company flood per
## 7 temperature property cent
## 8 day village drought
## 9 area flooding record
## 10 week climate water
x_topic_frex %>% head(10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## 1 agency barley butterfly beaver leak hail
## 2 river potato robinia wetland meter flash
## 3 flow wheat leaf vole target thunderstorm
## 4 status harvest bluebell toad order tornado
## 5 groundwater livestock leave lapwing desalination wind
## 6 environment crop pseudoacacia bird bill downpour
## 7 restriction grower berry habitat litre flooding
## 8 permit yield ivy population metering alert
## 9 level farmer lawn abstraction pipe lightning
## 10 drought carrot pot specie leakage blaze
## Topic 7 Topic 8 Topic 9 Topic 10
## 1 lock customer subsidence mm
## 2 cave pool claim rain
## 3 prairie health insurer average
## 4 trophy fire council snow
## 5 goal hosepipe barge rainfall
## 6 show ban village inch
## 7 school bottle beach month
## 8 game wildfire defence pollen
## 9 defender heatwave ship aquifer
## 10 housewife fine map figure
Determine the dominant topic for articles
# assign topical probability to each document
theta <- x_model[["theta"]] %>% as.data.frame()
# what is the maximum probability?
theta$max <- apply(theta, 1, max, na.rm=TRUE)
theta$temp_topic <- colnames(theta)[apply(theta,1,which.max)]
# does the max probability exceed 3*(1/optimalK)? for K = 10 the threshold is 0.3
theta$final_topic <- colnames(theta)[apply(theta[, c(1:as.numeric(optimalK))],1,which.max)]
theta$final_topic[which(theta$max < 3*(1/as.numeric(optimalK)))] <- NA # threshold: 3*(1/optimalK)
#add relevant metadata
theta$year <- x$year
theta$month <- x$month
theta$Art_ID <- x$Art_ID
theta$source <- x$source
# color palette function
get_Palette <- colorRampPalette(brewer.pal(12, "Set3"))
colorpalette <- get_Palette(as.numeric(optimalK))
# elements for legend
topic_label <- c()
for (k1 in c(1:optimalK)) {
temp <- paste0(k1, ". ", x_topic_prob[1,k1],
", ", x_topic_prob[2,k1],
", ", x_topic_prob[3,k1])
topic_label <- c(topic_label, temp)
rm(temp, k1)
}
temp_order <- make.dt(x_model)
temp_order <- summarize_all(temp_order, mean)
temp_order <- temp_order[, -c(1)]
temp_order <- unlist(temp_order)
temp_order <- order(temp_order, decreasing = TRUE)
topic_label <- topic_label[temp_order] # the order follows the result from plot.STM()
topic_level <- paste0("V", temp_order) # the order follows the result from plot.STM()
topic_label_new <- c("#1.Drought status", "#2.Agriculture", "#3.Gardening",
"#4.Aquatic ecosystem","#5.Water company", "#6.Flooding",
"#7.N/A", "#8.Hosepipe ban", "#9.N/A", "#10.Rain forecast")
topic_label_new <- topic_label_new[temp_order]
Generate a summary file
theta_summary3 <- theta %>%
complete(year, month, final_topic, fill = list(z = NA)) %>%
group_by(year, month, final_topic, .drop = F) %>%
summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>%
group_by(month) %>%
mutate(ratio = count/sum(count)) %>%
drop_na() %>%
mutate(ym = paste0(year, "-", month))
theta_summary3$month <- factor(theta_summary3$month)
theta_summary3 <- theta_summary3 %>% as.data.frame()
count_bymonth3 <- aggregate(count ~ ym, data = theta_summary3, FUN = sum) %>%
pull(count) # pull() returns a plain numeric vector
Plot by count
ggplot(theta_summary3,
aes(x = month, y = count, fill = factor(final_topic, levels = topic_level))) +
geom_bar(stat = "identity", position = position_stack(reverse = T)) +
theme_bw() +
xlab(" ") +
ylab("count")+
facet_wrap(~year, ncol=5) +
scale_fill_manual(label = topic_label, values = colorpalette, na.value = "gray70") +
theme(legend.position = "right", legend.title = element_blank(),
plot.title = element_text(size=12, face = "bold"),
plot.subtitle =element_text(size=10, face = "italic"))
Set a target year
y <- 2012
y0 <- "2012"
theta_summary2 <- theta %>%
dplyr::filter(year == y) %>%
dplyr::filter(month %in% reference$month[reference$year == y]) %>%
complete(month, final_topic, fill = list(z = NA)) %>%
group_by(month, final_topic, .drop = F) %>%
summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>%
group_by(month) %>%
mutate(ratio = count/sum(count)) %>%
drop_na() %>%
as.data.frame()
count_bymonth <- aggregate(count ~ month, data = theta_summary2, FUN = sum) %>%
pull(count) # pull() returns a plain numeric vector
Plot by count – treemap
temp_palette <- data.frame(final_topic = topic_level,
colors = colorpalette,
topic_label0 = topic_label_new)
theta_summary2 <- left_join(theta_summary2, temp_palette, by = "final_topic")
theta_summary2 <- theta_summary2 %>%
mutate(month_abb = month.abb[month])
treemap(theta_summary2,
index = c("month_abb", "topic_label0"),
vSize = "count",
type = "color",
vColor = "colors",
fontface.labels=c(2,1),
bg.labels=c("transparent"),
align.labels=list(
c("left", "top"),
c("center", "center")
),
fontsize.labels=c(30,18),
fontsize.title = 18,
border.col=c("black","white"),
title = paste0(datarange[[y0]],
". Articles per month: ", paste(count_bymonth, collapse = ", "))
)
Set a target year
y <- 2022
y0 <- "2022"
theta_summary2 <- theta %>%
dplyr::filter(year == y) %>%
dplyr::filter(month %in% reference$month[reference$year == y]) %>%
complete(month, final_topic, fill = list(z = NA)) %>%
group_by(month, final_topic, .drop = F) %>%
summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>%
group_by(month) %>%
mutate(ratio = count/sum(count)) %>%
drop_na() %>%
as.data.frame()
count_bymonth <- aggregate(count ~ month, data = theta_summary2, FUN = sum) %>%
pull(count)
Plot by count – treemap
temp_palette <- data.frame(final_topic = topic_level,
colors = colorpalette,
topic_label0 = topic_label_new)
theta_summary2 <- left_join(theta_summary2, temp_palette, by = "final_topic")
theta_summary2 <- theta_summary2 %>%
mutate(month_abb = month.abb[month])
treemap(theta_summary2,
index = c("month_abb", "topic_label0"),
vSize = "count",
type = "color",
vColor = "colors",
fontface.labels=c(2,1),
bg.labels=c("transparent"),
align.labels=list(
c("left", "top"),
c("center", "center")
),
fontsize.labels=c(30,18),
fontsize.title = 18,
border.col=c("black","white"),
title = paste0(datarange[[y0]],
". Articles per month: ", paste(count_bymonth, collapse = ", "))
)
spacy_11 <- readRDS(paste0("./ProcessedData/N_drought+england_ALL_spacy1.rds"))
topic_subset <- bind_cols(x, theta)
temp_all <- data.frame()
for (t in c(1, 10, 5,6,8,4,2,3,7,9)) {
topic_subset0 <- topic_subset %>% filter(final_topic %in% paste0("V", t))
spacy_1_topicsubset <- spacy_11 %>% subset(doc_id %in% topic_subset0$doc_id)
# entity
spacy_1_topic_entity <- spacy_1_topicsubset[!(spacy_1_topicsubset$entity==""), ]
## organization
spacy_1_topic_org <- spacy_1_topic_entity %>%
subset(entity %in% c("ORG_B")) %>% count(entity_result, sort = TRUE)
temp <- data.frame("topic" = t,
"org" = head(spacy_1_topic_org, n = 10))
temp_all <- rbind(temp_all, temp)
# remove byproducts
rm(spacy_1_topicsubset, spacy_1_topic_entity,
spacy_1_topic_org, temp)
}
temp_all
## topic org.entity_result org.n
## 1 1 the_Environment_Agency 274
## 2 1 The_Environment_Agency 122
## 3 1 Thames_Water 119
## 4 1 EA 84
## 5 1 Southern_Water 73
## 6 1 Environment_Agency 62
## 7 1 South_East_Water 59
## 8 1 Anglian_Water 56
## 9 1 Government 48
## 10 1 the_Met_Office 44
## 11 10 the_Environment_Agency 47
## 12 10 the_Met_Office 47
## 13 10 Met_Office 35
## 14 10 The_Environment_Agency 27
## 15 10 Thames_Water 26
## 16 10 Southern_Water 19
## 17 10 The_Met_Office 19
## 18 10 United_Utilities 13
## 19 10 the_Centre_for_Ecology_and_Hydrology 12
## 20 10 Water_UK 11
## 21 5 Thames_Water 138
## 22 5 Ofwat 116
## 23 5 the_Environment_Agency 53
## 24 5 Southern_Water 44
## 25 5 South_East_Water 38
## 26 5 The_Environment_Agency 36
## 27 5 Government 30
## 28 5 Thames 26
## 29 5 United_Utilities 23
## 30 5 Water_UK 23
## 31 6 the_Environment_Agency 79
## 32 6 the_Met_Office 73
## 33 6 The_Met_Office 64
## 34 6 The_Environment_Agency 50
## 35 6 Met_Office 37
## 36 6 Thames_Water 35
## 37 6 EA 32
## 38 6 Southern_Water 29
## 39 6 Environment_Agency 18
## 40 6 Yorkshire_Water 15
## 41 8 Thames_Water 154
## 42 8 Southern_Water 82
## 43 8 the_Environment_Agency 69
## 44 8 South_East_Water 63
## 45 8 Government 45
## 46 8 the_Met_Office 44
## 47 8 Yorkshire_Water 39
## 48 8 Welsh_Water 38
## 49 8 Met_Office 37
## 50 8 the_National_Drought_Group 33
## 51 4 the_Environment_Agency 41
## 52 4 Thames_Water 38
## 53 4 Southern_Water 18
## 54 4 Natural_England 17
## 55 4 RSPB 17
## 56 4 Ofwat 13
## 57 4 The_Environment_Agency 11
## 58 4 WWF 11
## 59 4 Labour 10
## 60 4 Defra 9
## 61 2 NFU 38
## 62 2 the_Environment_Agency 19
## 63 2 EU 12
## 64 2 the_National_Farmers_'_Union 10
## 65 2 Government 7
## 66 2 Lidl 7
## 67 2 Shell 7
## 68 2 the_National_Drought_Group 7
## 69 2 Bank 6
## 70 2 Guardian 6
## 71 3 Thames_Water 11
## 72 3 Southern_Water 10
## 73 3 the_Environment_Agency 10
## 74 3 South_East_Water 8
## 75 3 the_Met_Office 8
## 76 3 RHS 6
## 77 3 Anglian_Water 5
## 78 3 Hozelock 5
## 79 3 Veolia_Central 4
## 80 3 Veolia_South_East 4
## 81 7 Chelsea 16
## 82 7 RHS 15
## 83 7 Arsenal 12
## 84 7 Government 10
## 85 7 Thames_Water 10
## 86 7 Tottenham 8
## 87 7 IMF 7
## 88 7 Cabinet 6
## 89 7 City 6
## 90 7 West_Brom 6
## 91 9 the_Environment_Agency 11
## 92 9 The_Independent 10
## 93 9 Thames_Water 9
## 94 9 The_Environment_Agency 9
## 95 9 BGS 8
## 96 9 the_Met_Office 8
## 97 9 EU 7
## 98 9 Government 7
## 99 9 Southern_Water 7
## 100 9 HSBC 6