0. Setup for system and packages

Define system language and time

Sys.setenv(LANG = "en")
Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"

Load packages

invisible(
  lapply(
    c('LexisNexisTools', #Nexis data handling 
      
      'stringr', 'dplyr', 'broom', 'tidytext', 'tidyr', 'tidyverse', 'scales', 
      'reshape2','zoo', # df handling 
      
      'udpipe', 'spacyr', 'quanteda', #NLP
      
      'EMT', # multinomial test 
      
      'stm', #topic modeling 
      
      'readxl', 'xlsx', # handling xlsx or csv files 
      
      'ggplot2', 'wordcloud', 'ggwordcloud', 'RColorBrewer', 'ggpubr', 
      'ggnewscale', 'treemap', 'ggh4x' #data visualization 
      ),
       require, character.only = T))

options(dplyr.summarise.inform = F) # silence dplyr's summarise() messages

System overview

xfun::session_info(dependencies = FALSE)
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## 
## Locale:
##   LC_COLLATE=English_United States.utf8 
##   LC_CTYPE=English_United States.utf8   
##   LC_MONETARY=English_United States.utf8
##   LC_NUMERIC=C                          
##   LC_TIME=English_United States.1252    
## 
## time zone: Europe/Zurich
## tzcode source: internal
## 
## Package version:
##   ggh4x_0.2.8               treemap_2.4-4            
##   ggnewscale_0.4.9          ggpubr_0.6.0             
##   ggwordcloud_0.5.0         wordcloud_2.6            
##   RColorBrewer_1.1-3        xlsx_0.6.5               
##   readxl_1.4.2              stm_1.3.6                
##   EMT_1.3                   quanteda_3.3.1           
##   spacyr_1.2.1              udpipe_0.8.11            
##   zoo_1.8-12                reshape2_1.4.4           
##   scales_1.2.1              lubridate_1.9.2          
##   forcats_1.0.0             purrr_1.0.1              
##   readr_2.1.4               tibble_3.2.1             
##   ggplot2_3.4.2             tidyverse_2.0.0          
##   tidyr_1.3.0               tidytext_0.4.1           
##   broom_1.0.4               dplyr_1.1.2              
##   stringr_1.5.1             LexisNexisTools_0.3.7    
##   tidyselect_1.2.0          gridBase_0.4-7           
##   fastmap_1.1.1             janeaustenr_1.0.0        
##   promises_1.2.0.1          digest_0.6.31            
##   mime_0.12                 timechange_0.2.0         
##   lifecycle_1.0.4           ellipsis_0.3.2           
##   tokenizers_0.3.0          magrittr_2.0.3           
##   compiler_4.3.2            rlang_1.1.1              
##   sass_0.4.6                tools_4.3.2              
##   igraph_1.5.0              utf8_1.2.3               
##   yaml_2.3.7                data.table_1.14.8        
##   knitr_1.43                ggsignif_0.6.4           
##   stopwords_2.3             plyr_1.8.8               
##   abind_1.4-5               withr_3.0.0              
##   grid_4.3.2                fansi_1.0.4              
##   xtable_1.8-4              colorspace_2.1-0         
##   cli_3.6.1                 rmarkdown_2.22           
##   generics_0.1.3            RcppParallel_5.1.7       
##   stringdist_0.9.12         rstudioapi_0.14          
##   tzdb_0.4.0                pbapply_1.7-2            
##   cachem_1.0.8              parallel_4.3.2           
##   cellranger_1.1.0          vctrs_0.6.2              
##   Matrix_1.6-0              jsonlite_1.8.5           
##   carData_3.0-5             car_3.1-2                
##   hms_1.1.3                 rstatix_0.7.2            
##   jquerylib_0.1.4           quanteda.textstats_0.96.2
##   glue_1.6.2                stringi_1.7.12           
##   rJava_1.0-6               gtable_0.3.3             
##   later_1.3.1               munsell_0.5.0            
##   pillar_1.9.0              xlsxjars_0.6.1           
##   htmltools_0.5.5           R6_2.5.1                 
##   shiny_1.7.4               evaluate_0.21            
##   lattice_0.21-9            png_0.1-8                
##   backports_1.4.1           SnowballC_0.7.1          
##   httpuv_1.6.11             bslib_0.4.2              
##   Rcpp_1.0.10               fastmatch_1.1-3          
##   nsyllable_1.0.1           xfun_0.39                
##   pkgconfig_2.0.3

1 Corpus building

1.1 Download data from Nexis Uni

Data were downloaded from https://advance.lexis.com. For the detailed search query, please refer to the paper.

1.2 Import data as an R data frame

# import data; this step may prompt you to select a language
LNToutput <- lnt_read(x = paste0("./ScrapedData/Nexis")) 


# convert raw data into paragraphs 
meta_paragraphs_df <- lnt_convert(LNToutput, to = "data.frame", what = "Paragraphs")

doclist <- unique(meta_paragraphs_df$Art_ID)

classification_indices <- which(meta_paragraphs_df$Paragraph == 'Classification')


# restructure the data 
metadata <- data.frame()

bodytext <- data.frame()

for (i in 1:length(doclist)) {
  
  # targeting a single article 
  subset <- meta_paragraphs_df %>% filter(Art_ID == i) 
  
  # selecting and restructuring metadata
  marker <- classification_indices[i] 
  
  start_row <- which(subset$Par_ID == marker)
  
  temp <- subset[start_row:nrow(subset), ]
  temp <- temp %>% select(Art_ID, Par_ID, Paragraph)
  
  metadata <- bind_rows(metadata, temp)
  
  # selecting and restructuring text sentences 
  temp1 <- subset[1:(start_row - 1), ]
  bodytext <- bind_rows(bodytext, temp1)
  
  rm(i, subset, marker, start_row, temp, temp1)
  
}


# restructure the metadata into key-value pairs
metadata$cate <- sapply(strsplit(metadata$Paragraph, ":"), function(x) x[1])

metadata$content <- sapply(strsplit(metadata$Paragraph, ":"), function(x) x[2]) %>% 
  trimws()

metadata <- metadata %>% select(Art_ID, cate, content) %>% drop_na()

metadata <- spread(metadata, key = cate, value = content)


# filter the body text 
bodytext <- bodytext %>%
  filter(Paragraph != "Graphic" & Paragraph != "" & Paragraph != " ")

extra <- bodytext %>% select(-Par_ID, -Paragraph) %>% unique()


# combine text and metadata 
Nexisdata_df <- bodytext %>% group_by(Art_ID) %>%
  summarize(Sentence = paste(Paragraph, collapse = " "))

Nexisdata_df <- left_join(Nexisdata_df, extra, by = "Art_ID")

Nexisdata_df <- left_join(Nexisdata_df, metadata, by = "Art_ID")

Nexisdata_df <- Nexisdata_df %>% filter(!endsWith(Source_File, "doclist.docx")) 

Nexisdata_df <- Nexisdata_df %>%
  select("Art_ID", "Headline", "Sentence", "Newspaper", "Date", 
         "Length", "Language","Subject", "Geographic", "Company", 
         "Industry", "Ticker", "Journal Code", "Section", "Edition", 
         "Author", "Graphic", "Publication-Type", "Source_File")

1.3 Clean the data

# read the saved raw data 
Nexisdata_df <- readRDS("./ScrapedData/Nexisdata_all_raw.rds")

paste0("Number of articles from Nexis Uni: ", nrow(Nexisdata_df))
## [1] "Number of articles from Nexis Uni: 1361"

Filtering process 1

# remove duplicates in headlines 
Nexisdata_df0 <- Nexisdata_df[!duplicated(Nexisdata_df[c("Headline")]), ]


# identify primary subject & region: take the first list item, trimmed at the first parenthesis  
Nexisdata_df0$mainsubject <- sapply(strsplit(Nexisdata_df0$Subject, ";"), function(x) x[1])
Nexisdata_df0$mainsubject <- gsub("\\(.*", "", Nexisdata_df0$mainsubject) 

Nexisdata_df0$mainregion <- sapply(strsplit(Nexisdata_df0$Geographic, ";"), function(x) x[1])
Nexisdata_df0$mainregion <- gsub("\\(.*", "", Nexisdata_df0$mainregion)


# assign new order 
Nexisdata_df0$order <- c(1:nrow(Nexisdata_df0))


# reorder the columns 
Nexisdata_df0 <- Nexisdata_df0 %>%
  select("order","Art_ID", "mainsubject", "mainregion", everything())


# Remove irrelevant sections (this metadata is provided by news providers)
Nexisdata_df00 <- subset(Nexisdata_df0,
                             !grepl(paste(c("Sport", "SPORT"),
                                          collapse = "|"), Section))

Nexisdata_df00 <- subset(Nexisdata_df00, 
                           !grepl("SOCCER|CRICKET",mainsubject, ignore.case = TRUE))



# Remove irrelevant regions (this metadata is auto-generated by Nexis Uni)
Nexisdata_df00 <- subset(Nexisdata_df00, 
                           grepl(paste(c("UNITED KINGDOM", "ENGLAND"), 
                                       collapse = "|"), mainregion, ignore.case = TRUE))

Nexisdata_df00 <- subset(Nexisdata_df00, 
                           !grepl("NEW SOUTH WALES", mainregion, ignore.case = TRUE))


# Remove rows with no valid date metadata 
Nexisdata_df00 <- Nexisdata_df00 %>% drop_na(Date)

Filtering process 2: Broadsheets, Tabloids, Printed, Online

monthly_count_Nexis_summary <- table(Nexisdata_df00$Newspaper) %>% 
  as.data.frame() %>% arrange(desc(Freq))


monthly_count_Nexis_summary <- monthly_count_Nexis_summary %>%
  mutate(Tabloid = ifelse(str_detect(tolower(Var1), 
                                     'telegraph|independent|times|guardian|financial'), 0, 
                         ifelse(str_detect(tolower(Var1), 'mail|mirror|sun'), 1, 9)))


monthly_count_Nexis_summary <- monthly_count_Nexis_summary %>% subset(Tabloid != 9)


monthly_count_Nexis_summary <- monthly_count_Nexis_summary %>%
  mutate(Online = ifelse(str_detect(tolower(Var1), '\\.co\\.uk|\\.com|online'), 1, 0))



Nexisdata_df000 <- Nexisdata_df00 %>% 
  separate(col = Date, into = c("year", "month", "daytime"), sep = "-")

Nexisdata_df000[, c("year","month")] <- sapply(Nexisdata_df000[, c("year","month")], 
                                                as.numeric)

Nexisdata_df1 <- Nexisdata_df000 %>% 
  dplyr::filter(Newspaper %in% monthly_count_Nexis_summary$Var1)

paste0("Number of valid articles after filtering: ", nrow(Nexisdata_df1))
## [1] "Number of valid articles after filtering: 836"

1.4 Data overview

Year-monthly article count

data3_Nexis <- Nexisdata_df1 %>% group_by(year, month) %>%  summarize(count = n()) 

monthly_count_Nexis <- data.frame(year = rep(2000:2023, each = 12), month = rep(1:12, 24))

monthly_count_Nexis <- left_join(monthly_count_Nexis, data3_Nexis)

monthly_count_Nexis[is.na(monthly_count_Nexis)] = 0


monthly_count_Nexis[, c("year", "month")] <- sapply(monthly_count_Nexis[, c("year", "month")], as.character)


monthly_count_Nexis <- monthly_count_Nexis %>%
  mutate(ym = paste0(year, "-", month))


monthly_count_Nexis %>% 
  
  mutate(month = fct_relevel(month, unique(monthly_count_Nexis$month))) %>%
  
  ggplot(aes(x=month, y=count)) +
  
  geom_bar(stat='identity', fill="forestgreen") + 
  
  labs(title = "Number of articles about drought") +
  
  theme_light()+
  
  facet_wrap(~year,  ncol=8) + 
  
  theme(strip.background =element_rect(fill="white")) +
  
  theme(strip.text = element_text(colour = 'black'))

2 Data collection: Temp & Prcp data

2.1 Retrieve downloaded data

# Mean Central England Temperature (MCET)
#https://www.metoffice.gov.uk/hadobs/hadcet/data/meantemp_monthly_totals.txt
List_UK_temp <- read.xlsx(paste0("./ScrapedData/MET/Longterm_TP.xlsx"), sheetName = "MCET")


# Monthly England and Wales precipitation (MEWP)
#https://www.metoffice.gov.uk/hadobs/hadukp/data/monthly/HadEWP_monthly_totals.txt
List_UK_prcp <- read.xlsx(paste0("./ScrapedData/MET/Longterm_TP.xlsx"), sheetName = "MEWP")

2.2 Restructure data

# calculate anomaly based on 1991-2020 average
List_UK_temp_subset_ref <- List_UK_temp %>% subset(Year %in% c(1991:2020)) %>% 
  select(-Year, -Annual)

UK_temp_ref_mean <- colMeans(List_UK_temp_subset_ref)

List_UK_prcp_subset_ref <- List_UK_prcp %>% subset(Year %in% c(1991:2020)) %>% 
  select(-Year, -Annual)

UK_prcp_ref_mean <- colMeans(List_UK_prcp_subset_ref)


# restructure data 
List_UK_temp_subset_target <- List_UK_temp %>% subset(Year %in% c(2000:2023)) %>% 
  select(-Annual)

temp_target_anomaly <- sweep(List_UK_temp_subset_target[2:13], 2, UK_temp_ref_mean, "-")

temp_target_anomaly$Year <- List_UK_temp_subset_target$Year

temp_target_anomaly_long <- temp_target_anomaly %>% 
  pivot_longer(cols = -Year, names_to = "Month", values_to = "Value")

temp_target_anomaly_long$month <- rep(1:12, length(2000:2023))

temp_target_anomaly_long <- temp_target_anomaly_long %>% subset(Year %in% c(2000:2023))


List_UK_prcp_subset_target <- List_UK_prcp %>% subset(Year %in% c(2000:2023)) %>% 
  select(-Annual)

prcp_target_anomaly <- sweep(List_UK_prcp_subset_target[2:13], 2, UK_prcp_ref_mean, "-")

prcp_target_anomaly$Year <- List_UK_prcp_subset_target$Year

prcp_target_anomaly_long <- prcp_target_anomaly %>% 
  pivot_longer(cols = -Year, names_to = "Month", values_to = "Value")

prcp_target_anomaly_long$month <- rep(1:12, length(2000:2023))

prcp_target_anomaly_long <- prcp_target_anomaly_long %>% subset(Year %in% c(2000:2023))


# add temp and prcp to Nexis counts  
monthly_count_Nexis$temp_anomaly <- temp_target_anomaly_long$Value

monthly_count_Nexis$prcp_anomaly <- prcp_target_anomaly_long$Value


# remove the months 9, 10, 11, and 12 of 2023
monthly_count_Nexis <- head(monthly_count_Nexis, n = nrow(monthly_count_Nexis) - 4) 

2.3 Calculate rolling-window anomalies

# create new values considering temporal lags 
monthly_count_Nexis$temp_2mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly,width = 2, 
                                                 FUN = mean, fill = 0, align = "right")

monthly_count_Nexis$temp_3mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly,width = 3, 
                                                 FUN = mean, fill = 0, align = "right")

monthly_count_Nexis$temp_3sum <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 3, 
                                                FUN = sum, fill = 0, align = "right")

monthly_count_Nexis$temp_4mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 4, 
                                                 FUN = mean, fill = 0, align = "right")

monthly_count_Nexis$temp_5mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 5, 
                                                 FUN = mean, fill = 0, align = "right")

monthly_count_Nexis$temp_6mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 6, 
                                                 FUN = mean, fill = 0, align = "right")

monthly_count_Nexis$temp_12mean <- zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = 12, 
                                                 FUN = mean, fill = 0, align = "right")


monthly_count_Nexis$prcp_2sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly,width = 2, 
                                                FUN = sum, fill = 0, align = "right")

monthly_count_Nexis$prcp_3mean <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 3, 
                                                 FUN = mean, fill = 0, align = "right")

monthly_count_Nexis$prcp_3sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 3, 
                                                FUN = sum, fill = 0, align = "right")

monthly_count_Nexis$prcp_4sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 4, 
                                                FUN = sum, fill = 0, align = "right")

monthly_count_Nexis$prcp_5sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 5, 
                                                FUN = sum, fill = 0, align = "right")

monthly_count_Nexis$prcp_6sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 6, 
                                                FUN = sum, fill = 0, align = "right")

monthly_count_Nexis$prcp_12sum <- zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = 12, 
                                                FUN = sum, fill = 0, align = "right")
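The block above spells out each window explicitly. A compact, equivalent sketch for the mean-temperature and summed-precipitation columns (the extra temp_3sum and prcp_3mean columns would still need their own calls):

for (w in c(2, 3, 4, 5, 6, 12)) {
  # right-aligned rolling windows, zero-filled at the start of the series
  monthly_count_Nexis[[paste0("temp_", w, "mean")]] <-
    zoo::rollapply(monthly_count_Nexis$temp_anomaly, width = w,
                   FUN = mean, fill = 0, align = "right")
  monthly_count_Nexis[[paste0("prcp_", w, "sum")]] <-
    zoo::rollapply(monthly_count_Nexis$prcp_anomaly, width = w,
                   FUN = sum, fill = 0, align = "right")
}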

3 Correlation between media coverage and meteorological data

3.1 Entire dataset

3.1.1 Correlation plot with 6-month window (Fig. 1)

monthly_count_Nexis %>%
  mutate(zero = ifelse(count == 0, 0, 9)) %>% 
  
  ggplot(aes(x=temp_6mean, y=prcp_6sum)) +
  
  geom_point(color = "black", stroke = 0.2, shape = 21, 
             aes(fill = factor(zero), size = count, alpha = count)) +
  
  scale_fill_manual(values = c("white","darkblue")) +

  scale_size_continuous(
    range = c(0, 10)  
    ) +
  
  labs(
    x = "Temp. Anomaly: 6month",
    y = "Prcp. Anomaly: 6month",
    title = paste0("Correlation plot: count of articles, n = ", 
                   sum(monthly_count_Nexis$count))
  ) +
  
  guides(fill = "none", 
         size = guide_legend(title = waiver())) +
  
  theme_bw()  +
  
  geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +  
  
  geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.2) +

  xlim(-3.5, 3.5) +   ylim(-250, 250)

3.1.2 Multinomial test

# H0: the categorical variable follows the hypothesized distribution.
# HA: the categorical variable does not follow the hypothesized distribution.

# when the p-value is less than the significance level (e.g., α = .05),
# we reject the null hypothesis


temp_default_qd1 <- monthly_count_Nexis %>% 
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd2 <- monthly_count_Nexis %>% 
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd3 <- monthly_count_Nexis %>% 
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 

temp_default_qd4 <- monthly_count_Nexis %>% 
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 


temp_default_qd <- c(length(unique(temp_default_qd1$ym)), 
                     length(unique(temp_default_qd2$ym)),
                     length(unique(temp_default_qd3$ym)), 
                     length(unique(temp_default_qd4$ym)))

temp_default_sum <- sum(temp_default_qd)

temp_default_qd <- (temp_default_qd)/temp_default_sum

temp_default_qd
## [1] 0.2616487 0.2007168 0.1648746 0.3727599
temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count), 
                   sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1]  69  42  82 643
chisq.test(temp_observed_qd, p=temp_default_qd)
## 
##  Chi-squared test for given probabilities
## 
## data:  temp_observed_qd
## X-squared = 571.8, df = 3, p-value < 2.2e-16
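The test above uses chisq.test() as the large-sample approximation. Since the EMT package is already loaded, an exact multinomial test could serve as a cross-check; this sketch (not part of the original analysis) uses a Monte Carlo approximation to keep the computation tractable for counts this large:

# exact multinomial test as a cross-check (sketch)
set.seed(123)
EMT::multinomial.test(temp_observed_qd, temp_default_qd, MonteCarlo = TRUE)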

3.1.3 Correlation plot with different windows (Supplementary 2)

1-month window

monthly_count_Nexis %>%
  mutate(zero = ifelse(count == 0, 0, 9)) %>% 
  
  ggplot(aes(x=temp_anomaly, y=prcp_anomaly)) +
  
  geom_point(color = "black", stroke = 0.2, shape = 21, 
             aes(fill = factor(zero), size = count, alpha = count)) +
  
  scale_fill_manual(values = c("white","darkblue")) +

  scale_size_continuous(
    range = c(0, 10)  
    ) +
  
  labs(
    x = "Temp. Anomaly: 1month",
    y = "Prcp. Anomaly: 1month",
    title = paste0("Correlation plot: count of articles, n = ", 
                   sum(monthly_count_Nexis$count))
  ) +
  
  guides(fill = "none", 
         size = guide_legend(title = waiver())) +
  
  theme_bw()  +
  
  geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +  
  
  geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.2) +

  xlim(-3.5, 3.5) +   ylim(-250, 250)

3-month window

monthly_count_Nexis %>%
  mutate(zero = ifelse(count == 0, 0, 9)) %>% 
  
  ggplot(aes(x=temp_3mean, y=prcp_3sum)) +
  
  geom_point(color = "black", stroke = 0.2, shape = 21, 
             aes(fill = factor(zero), size = count, alpha = count)) +
  
  scale_fill_manual(values = c("white","darkblue")) +

  scale_size_continuous(
    range = c(0, 10)  
    ) +
  
  labs(
    x = "Temp. Anomaly: 3month",
    y = "Prcp. Anomaly: 3month",
    title = paste0("Correlation plot: count of articles, n = ", 
                   sum(monthly_count_Nexis$count))
  ) +
  
  guides(fill = "none", 
         size = guide_legend(title = waiver())) +
  
  theme_bw()  +
  
  geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +  
  
  geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.2) +

  xlim(-3.5, 3.5) +   ylim(-250, 250)

12-month window

monthly_count_Nexis %>%
  mutate(zero = ifelse(count == 0, 0, 9)) %>% 
  
  ggplot(aes(x=temp_12mean, y=prcp_12sum)) +
  
  geom_point(color = "black", stroke = 0.2, shape = 21, 
             aes(fill = factor(zero), size = count, alpha = count)) +
  
  scale_fill_manual(values = c("white","darkblue")) +

  scale_size_continuous(
    range = c(0, 10)  
    ) +
  
  labs(
    x = "Temp. Anomaly: 12month",
    y = "Prcp. Anomaly: 12month",
    title = paste0("Correlation plot: count of articles, n = ", 
                   sum(monthly_count_Nexis$count))
  ) +
  
  guides(fill = "none", 
         size = guide_legend(title = waiver())) +
  
  theme_bw()  +
  
  geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +  
  
  geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.2) +

  xlim(-3.5, 3.5) +   ylim(-250, 250)

3.2 Seasonality test

3.2.1 Define seasonality

monthly_count_Nexis <- monthly_count_Nexis %>%
  
  mutate(season = if_else(month %in% c(9, 10, 11), "SON",
                              if_else(month %in% c(12, 1,2), "DJF", 
                                      if_else(month %in% c(3,4,5), "MAM", 
                                              if_else(month %in% c(6,7,8), "JJA", NA)))))


monthly_count_Nexis$season <- factor(monthly_count_Nexis$season, 
                                     levels=c("SON", "DJF", "MAM","JJA"))
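An equivalent lookup-based sketch (assuming month is coercible to an integer between 1 and 12):

season_lookup <- c("DJF", "DJF", "MAM", "MAM", "MAM", "JJA",
                   "JJA", "JJA", "SON", "SON", "SON", "DJF")

monthly_count_Nexis$season <- factor(season_lookup[as.numeric(monthly_count_Nexis$month)],
                                     levels = c("SON", "DJF", "MAM", "JJA"))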

3.2.2 Plot in a facet wrap (Fig. 2)

monthly_count_Nexis %>%
  mutate(zero = ifelse(count == 0, 0, 9)) %>% 
  
  ggplot(aes(x=temp_6mean, y=prcp_6sum, group=season)) + 
  
  geom_point(color = "black", stroke = 0.2, shape = 21, 
             aes(fill = factor(zero), size = count, alpha = count)) +
  
  scale_fill_manual(values = c("white","darkblue")) +
  
  labs(
    x = "Temp. Anomaly: 6month",
    y = "Prcp. Anomaly: 6month",
    title = "Correlation plot: count of articles by seasonality",
    subtitle = paste0("SON: ", sum(subset(monthly_count_Nexis, season == "SON")$count), ", ", 
                      "DJF: ", sum(subset(monthly_count_Nexis, season == "DJF")$count), ", ", 
                      "MAM: ", sum(subset(monthly_count_Nexis, season == "MAM")$count), ", ", 
                      "JJA: ", sum(subset(monthly_count_Nexis, season == "JJA")$count))
  ) +
  
  scale_size_continuous(
    name = "count",
    range = c(1, 10), 
    guide = guide_legend(      
      title.position = "top",  
      label.position = "right") 
    ) +
  
  facet_wrap(~season) +
  
  guides(color = "none",
         fill = "none",
         size = guide_legend(title = waiver())) +
  
  theme_bw()  +
  
  geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.3) +  
  
  geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 0.3) +
  
  xlim(-3.5, 3.5) +   ylim(-250, 250)

3.2.3 Multinomial test by seasonality

SON

target_season <- "SON"

temp_default_qd1 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd2 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd3 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 

temp_default_qd4 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 

temp_default_qd <- c(length(unique(temp_default_qd1$ym)), 
                     length(unique(temp_default_qd2$ym)),
                     length(unique(temp_default_qd3$ym)), 
                     length(unique(temp_default_qd4$ym)))
temp_default_qd
## [1] 12 17 10 30
temp_default_sum <- sum(temp_default_qd)

temp_default_qd <- (temp_default_qd)/temp_default_sum

temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count), 
                   sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1]  5  5  5 40
chisq.test(temp_observed_qd, p=temp_default_qd)
## 
##  Chi-squared test for given probabilities
## 
## data:  temp_observed_qd
## X-squared = 19.504, df = 3, p-value = 0.000215

DJF

target_season <- "DJF"

temp_default_qd1 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd2 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd3 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 

temp_default_qd4 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 

temp_default_qd <- c(length(unique(temp_default_qd1$ym)), 
                     length(unique(temp_default_qd2$ym)),
                     length(unique(temp_default_qd3$ym)), 
                     length(unique(temp_default_qd4$ym)))
temp_default_qd
## [1] 19 12 10 28
temp_default_sum <- sum(temp_default_qd)

temp_default_qd <- (temp_default_qd)/temp_default_sum

temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count), 
                   sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1] 14  9  4 88
chisq.test(temp_observed_qd, p=temp_default_qd)
## 
##  Chi-squared test for given probabilities
## 
## data:  temp_observed_qd
## X-squared = 62.142, df = 3, p-value = 2.048e-13

MAM

target_season <- "MAM"

temp_default_qd1 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd2 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd3 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 

temp_default_qd4 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 

temp_default_qd <- c(length(unique(temp_default_qd1$ym)), 
                     length(unique(temp_default_qd2$ym)),
                     length(unique(temp_default_qd3$ym)), 
                     length(unique(temp_default_qd4$ym)))
temp_default_qd
## [1] 22 12 13 22
temp_default_sum <- sum(temp_default_qd)

temp_default_qd <- (temp_default_qd)/temp_default_sum

temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count), 
                   sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1]  30  11  48 206
chisq.test(temp_observed_qd, p=temp_default_qd)
## 
##  Chi-squared test for given probabilities
## 
## data:  temp_observed_qd
## X-squared = 209.55, df = 3, p-value < 2.2e-16

JJA

target_season <- "JJA"

temp_default_qd1 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd2 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum > 0 ) 

temp_default_qd3 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean < 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 

temp_default_qd4 <- monthly_count_Nexis %>% dplyr::filter(season == target_season) %>%
  dplyr::filter(temp_6mean > 0 ) %>% dplyr::filter(prcp_6sum < 0 ) 

temp_default_qd <- c(length(unique(temp_default_qd1$ym)), 
                     length(unique(temp_default_qd2$ym)),
                     length(unique(temp_default_qd3$ym)), 
                     length(unique(temp_default_qd4$ym)))
temp_default_qd
## [1] 20 15 13 24
temp_default_sum <- sum(temp_default_qd)

temp_default_qd <- (temp_default_qd)/temp_default_sum

temp_observed_qd <- c(sum(temp_default_qd1$count), sum(temp_default_qd2$count), 
                   sum(temp_default_qd3$count), sum(temp_default_qd4$count))
temp_observed_qd
## [1]  20  17  25 309
chisq.test(temp_observed_qd, p=temp_default_qd)
## 
##  Chi-squared test for given probabilities
## 
## data:  temp_observed_qd
## X-squared = 418.03, df = 3, p-value < 2.2e-16
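The four seasonal blocks above repeat the same quadrant logic. A compact helper (a sketch, not part of the original pipeline) reproduces each test:

quadrant_chisq <- function(df, target_season = NULL) {
  
  # optionally restrict to one season
  if (!is.null(target_season)) df <- dplyr::filter(df, season == target_season)
  
  # split months into the four temp/prcp anomaly quadrants
  quadrants <- list(dplyr::filter(df, temp_6mean > 0, prcp_6sum > 0),
                    dplyr::filter(df, temp_6mean < 0, prcp_6sum > 0),
                    dplyr::filter(df, temp_6mean < 0, prcp_6sum < 0),
                    dplyr::filter(df, temp_6mean > 0, prcp_6sum < 0))
  
  expected <- sapply(quadrants, function(d) length(unique(d$ym)))
  observed <- sapply(quadrants, function(d) sum(d$count))
  
  chisq.test(observed, p = expected / sum(expected))
}

# e.g. quadrant_chisq(monthly_count_Nexis, "JJA") reproduces the JJA result above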

Archive and remove data

rm(target_season, temp_default_qd1, temp_default_qd2, temp_default_qd3, temp_default_qd4,
   temp_default_qd, temp_default_sum)

write.xlsx(monthly_count_Nexis, 
           file = paste0("./ProcessedData/Nexisdata_summary.xlsx"))

3.3 Correlation heatmap

3.3.1 Create a winsorizing (outlier-capping) function

winsorize <- function(x, p = c(0.01, 0.99)) {
  q <- quantile(x, probs = p, na.rm = TRUE)
  x[x < q[1]] <- q[1]
  x[x > q[2]] <- q[2]
  x
}
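A quick illustration of what winsorize() does, on toy values:

set.seed(1)
v <- c(rnorm(100), 50)   # one extreme outlier
range(v)                 # upper end is 50
range(winsorize(v))      # upper end is now capped near the 99th percentile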

3.3.2 Restructure data

monthly_count_Nexis$ym <- as.yearmon(monthly_count_Nexis$ym)

monthly_count_Nexis$ym <- as.character(monthly_count_Nexis$ym)

monthly_count_Nexis$year <- as.numeric(monthly_count_Nexis$year)

monthly_count_Nexis$month <- as.numeric(monthly_count_Nexis$month)


monthly_count_Nexis_0 <- data.frame("ym" = monthly_count_Nexis$ym, 
                                    "year" = monthly_count_Nexis$year,
                                    "month" = monthly_count_Nexis$month,
                                    "count" = monthly_count_Nexis$count, 
                                    "source" = "count.article")

monthly_count_Nexis_0$count_rescale2 <- rescale(winsorize(monthly_count_Nexis_0$count))


# 6-month mean temp 
monthly_count_Nexis_T <- data.frame("ym" = monthly_count_Nexis$ym, 
                                    "year" = monthly_count_Nexis$year,
                                    "month" = monthly_count_Nexis$month,
                                    "count" = round(monthly_count_Nexis$temp_6mean, digits = 1),
                                    "source" = "temp.6mo.avg")

monthly_count_Nexis_T$count_rescale2 <- rescale(monthly_count_Nexis_T$count)



# 6-month sum prcp 
monthly_count_Nexis_P <- data.frame("ym" = monthly_count_Nexis$ym, 
                                    "year" = monthly_count_Nexis$year,
                                    "month" = monthly_count_Nexis$month,
                                    "count" = round(monthly_count_Nexis$prcp_6sum, digits = 0), 
                                    "source" = "prcp.6mo.sum")

monthly_count_Nexis_P$count_rescale2 <- rescale(monthly_count_Nexis_P$count)

3.3.3 Correlation analysis (Fig. 3)

temp_stat <- cor.test(monthly_count_Nexis$count, round(monthly_count_Nexis$temp_6mean, 3))

paste0("Temperature in 6-month window: coefficient ", round(temp_stat$estimate,3), 
       ", p-value ", round(temp_stat$p.value, 3))
## [1] "Temperature in 6-month window: coefficient 0.146, p-value 0.014"
prcp_stat <- cor.test(monthly_count_Nexis$count, round(monthly_count_Nexis$prcp_6sum, 3))

paste0("Precipitation in 6-month window: coefficient ", round(prcp_stat$estimate,3), 
       ", p-value ", round(prcp_stat$p.value, 3))
## [1] "Precipitation in 6-month window: coefficient -0.206, p-value 0"

Plot the anomaly trend

plot_weather2 <- monthly_count_Nexis %>%
  
  mutate(ym = factor(ym, levels=ym)) %>%
  
  mutate(temp_6mean_zscore = as.vector(scale(temp_6mean))) %>%
  
  mutate(prcp_6sum_zscore = as.vector(scale(prcp_6sum))) %>%
  
  mutate(order = 1:n()) %>%
  
  mutate(both_conditions_met = as.vector(temp_6mean_zscore > 0 & prcp_6sum_zscore < 0)) %>%
  
  ggplot(aes(x = order)) +
  
  geom_line(aes(y = temp_6mean_zscore, color = "temp_6mean_zscore"), group = 1) +
  
  geom_line(aes(y = prcp_6sum_zscore, color = "prcp_6sum_zscore"), group = 2) +
  
  scale_color_manual(values = c("#2c7bb6", "#d7191c"),
                     labels = c("Prcp anomaly: 6 month sum", "Temp anomaly: 6 month average")) +
  labs(x = " ", y = "Z-score") +
  
  geom_hline(yintercept = 0, color = "black", linetype = "dashed", size = 0.2) +
  
  ylim(-4, 4) +
  
  theme_bw() +

  # Apply conditional coloring
  geom_rect(aes(xmin = order - 0.5, xmax = order + 0.5, ymin = -Inf, ymax = Inf, 
                fill = both_conditions_met), alpha = 0.5) +
  
  scale_fill_manual(values = c(`TRUE` = "gold", `FALSE` = "white"), guide = "none") +  
  
  scale_x_continuous(
    expand = c(0,0),
    breaks = seq(1, nrow(monthly_count_Nexis), by = 12),
    labels = monthly_count_Nexis$ym[seq(1, nrow(monthly_count_Nexis), by = 12)]
  ) +
  
  guides(color = guide_legend(
    title = NULL,
    title.position = "top",
    keywidth = 1,
    keyheight = 1
  )) +
  
  theme(
    panel.background = element_blank(),  
    panel.border = element_blank(),  
    axis.text.x = element_text(angle = 30, vjust = 0.5, hjust=0.5),
    legend.background = element_blank(),
    legend.title = element_blank(),
    plot.background = element_blank()  
  )
  
  
  

plot_media2 <- monthly_count_Nexis %>% 
  
  mutate(ym = factor(ym, levels=ym)) %>% 
  
  ggplot(aes(x = ym, y = count, fill = "count")) + 

  geom_bar(stat = "identity") + 

  scale_x_discrete(breaks = function(x) x[seq(1, length(x), by = 12)]) +
  
  scale_fill_manual(values = "darkblue",
                     labels = "Article count: 1 month") +
  
  theme_bw() +
  
  labs(
    x = " ",
    y = "Count") + 

  theme(axis.text.x = element_blank(),
        legend.title = element_blank(),
        legend.background = element_blank()) 



ggpubr::ggarrange(plot_media2, plot_weather2, 
                  ncol = 1, nrow = 2, font.label = list(size = 12),
                  align = "v")

3.3.4 Plot a heatmap (Fig. 4)

ggplot() + 
  
  # temp6mean
  geom_point(
    data = monthly_count_Nexis_T,
    aes(month, source, color=count, size = 6)) +
  
  scale_color_gradient2(low = "white", mid = "white", high = "darkorange") +
  
  geom_text(data = monthly_count_Nexis_T,
            aes(month, source, label = count), 
            color = "black", size = 2) +
  
  
  # prcp6sum
  new_scale_color() +
  
  geom_point(
    data = monthly_count_Nexis_P,
    aes(month, source, color=count, size = 6)) +
  
  scale_color_gradient2(low = "brown", mid = "white", high = "white") +
  
  geom_text(data = monthly_count_Nexis_P,
            aes(month, source, label = count), 
            color = "black", size = 2) +
  

  # article count  
  new_scale_fill() +
  geom_tile(
    data = monthly_count_Nexis_0,
    aes(month, source, fill=count_rescale2)
  ) +
  
  scale_fill_gradient(low = "white", high = "darkblue") + 

  geom_text(data = monthly_count_Nexis_0,
            aes(month, source, label = count),
            color = ifelse(monthly_count_Nexis_0$count > 30, 
                           "white", "black"), size = 2) +
  
  facet_wrap(~year) +
  
  theme_minimal() +
  
  labs(x = " ", y = " ", fill = " ") +
  
  scale_x_continuous(breaks = 1:12, labels = month, 
                     position = "top", 
                     expand = c(0,0)) + 
  
  theme(legend.position = "none", 
        
        panel.grid.major = element_blank(),
        
        panel.grid.minor = element_blank(), 
        
        strip.placement = "outside", 
        
        panel.spacing.y = unit(0.5, "lines"), 
        
        panel.spacing.x = unit(0.5, "lines"),
        
        strip.text.x = element_text(size = 8, face = "bold"), 
        
        axis.text.x = element_text(size = 6, face = "bold.italic") 
        )

4 Topic modeling

STM tutorial by Roberts et al., 2019 (DOI: https://doi.org/10.18637/jss.v091.i02)

4.1 NLP pre-processing

Subset data for target years

reference <- data.frame("year" = rep(c(2012, 2022), each = 6),
                        "month" = c(1,2,3,4,5,6,       # for 2012
                                    7,8,9,10,11,12))    # for 2022


datarange <- list("2012" = "Jan to Jun, 2012", 
                "2022" = "July to Dec, 2022")


data_input <- Nexisdata_df1

Initialize the NLP model

#spacy_download_langmodel(model = "en_core_web_trf")
spacy_initialize(model = "en_core_web_trf") 

Proceed with NLP

# NLP. target text includes title, subtitle, and body text
spacy_0 <- spacy_parse(paste0(data_input$Headline, ". ", data_input$Sentence), 
                       
                       dependency = T, nounphrase = TRUE) 


# extract named entities 
data_entity <- entity_extract(spacy_0, type = "named", concatenator = "_")

data_entity$ngram <- sapply(strsplit(data_entity$entity, "_"), length)

data_entity_consolidate <- udpipe::txt_recode_ngram(spacy_0$token, 
                                                    compound = data_entity$entity, 
                                                    ngram = data_entity$ngram, 
                                                    sep = "_") %>% as.data.frame()

colnames(data_entity_consolidate) <- "entity_result"

spacy_11 <- cbind(spacy_0, data_entity_consolidate)


# consolidate selected NER tags: drop numeric/temporal types, keeping only "named" entities 
spacy_11$entity[spacy_11$entity %in% 
                  c("DATE_B", "DATE_I", "LANGUAGE_B", "LANGUAGE_I", "QUANTITY_I", 
                    "QUANTITY_B", "CARDINAL_I", "CARDINAL_B", "ORDINAL_B", 
                    "ORDINAL_I", "TIME_B", "TIME_I", "MONEY_I", "MONEY_B", 
                    "PERCENT_B", "PERCENT_I")] <- "" 

spacy_11$final <- with(spacy_11, ifelse(entity == "", lemma, entity_result)) 

spacy_11 <- spacy_11 %>% drop_na() 


# assign a new document-sentence ID
spacy_11$docsen <- paste0(spacy_11$doc_id, "_", spacy_11$sentence_id) 


# stop-word removal. the sw lexicon here is the same as snowball from the 'stopwords' R package 
sw <- quanteda::stopwords("english") %>% data.frame() 

colnames(sw) <- "final"

spacy_2 <-  spacy_11 %>% dplyr::anti_join(sw, by = "final")


# more data cleaning 
spacy_2 <- filter(spacy_2, spacy_2$pos != "PUNCT") #punctuation 

spacy_2 <- filter(spacy_2, nchar(spacy_2$final) != 1) # remove single-character tokens 

spacy_2 <- spacy_2[!(spacy_2$final %in% 
                       c("I", "’", "-", "•", "'", "/", "’s", "'s", "’re", "’ve", "n't")),]


# save the dataframes
saveRDS(spacy_11, paste0("./ProcessedData/N_drought+england_ALL_spacy1.rds"))
saveRDS(spacy_2, paste0("./ProcessedData/N_drought+england_ALL_spacy2.rds"))

4.2 Get data ready

spacy_2 <- readRDS(paste0("./ProcessedData/N_drought+england_ALL_spacy2.rds"))

# keep nouns and collapse tokens back into one text string per document 
x <- spacy_2 %>% 
  filter(pos %in% c("NOUN")) %>% 
  group_by(doc_id) %>% 
  mutate(text_string = paste(final, collapse = " ")) %>% 
  select(doc_id, text_string) %>% unique()


# label the source of each news article: broadsheet or tabloid? 
temp <- data_input %>% select(Art_ID, year, month, Newspaper)

temp <- temp %>%
  mutate(source = ifelse(str_detect(tolower(Newspaper), 
                                    'telegraph|independent|times|guardian|financial'), 
                         "broadsheet", 
                         ifelse(str_detect(tolower(Newspaper), 
                                           'mail|mirror|sun'), "tabloid", "NA")))

temp$doc_id <- paste0("text", c(1:nrow(temp)))


# merge the two datasets above
x <- left_join(x, temp, by = "doc_id")


# tokenize text strings 
x_token <- x$text_string %>% 
  tokens(what = "fastestword") # this line works the same as: stri_split_fixed(x$text_string, " ")


# create a data frame in the format required for topic modeling 
x_dfm <- dfm(x_token, tolower = F)
x_stm <- convert(x_dfm, to = "stm")

4.3 Define the number of topics k (Supplementary 3)

This part is optional; the final plot may give slightly different results on each run. To learn more, try ?searchK().

# candidate values of k for testing the optimal number of topics 
K1 <- c(4, 6, 8, 10, 12, 15, 20, 30)


# calculate semantic coherence, exclusivity, held-out likelihood, and residuals 
set.seed(123)
x_fit1 <- searchK(x_stm$documents, x_stm$vocab, K = K1, verbose = F)

x_plot1 <- data.frame("K" = K1, 
                      
                      "Coherence" = unlist(x_fit1$results$semcoh),  
                      
                      "Exclusivity" = unlist(x_fit1$results$exclus), 
                      
                      "heldout" = unlist(x_fit1$results$heldout),
                      
                      "residual" = unlist(x_fit1$results$residual))


# plot the result 
x_plot1 <- reshape2::melt(x_plot1, id = c("K"))

ggplot(x_plot1, aes(K, value, color = variable, group = 1)) +
  
  geom_point(show.legend = FALSE) +
  
  geom_line(show.legend = FALSE) + #size = 0.5, 
  
  scale_x_continuous(breaks = unique(x_plot1$K)) +
  
  facet_wrap(~variable, scales = "free_y") + 
  
  theme_bw() +
  
  labs(x = "Number of topics K", 
       title = "Statistical fit of models with different K")

4.4 Run the stm with optimal k

4.4.1 Run the model

optimalK <- 10

set.seed(123)

x_model <- stm(documents = x_stm$documents, 
               vocab = x_stm$vocab, 
               K = as.numeric(optimalK),
               data = x, 
               verbose = F) 

4.4.2 Review the topical proportions (Supplementary 4a)

plot.STM(x_model, type = "summary") 

4.5 Understand topic model results (Table 1)

Filter keywords per topic

# filter top 50 words 
x_topic <- labelTopics(x_model, n = 50) 

x_topic_prob <- data.frame("features" = t(x_topic$prob)) 
colnames(x_topic_prob) <- paste("Topic", c(1:as.numeric(optimalK)))

x_topic_frex <- data.frame("features" = t(x_topic$frex)) 
colnames(x_topic_frex) <- paste("Topic", c(1:as.numeric(optimalK)))


# examples of probability-based (prob) and FREX keywords
x_topic_prob %>% head(10)
##     Topic 1   Topic 2  Topic 3    Topic 4 Topic 5     Topic 6 Topic 7
## 1     water    farmer    water      water   water        rain    year
## 2   drought      crop    plant       year company temperature drought
## 3     river      year   garden    climate    year        week    time
## 4     level      food     tree  reservoir drought     weather   thing
## 5  rainfall     price  drought      river  supply         day     day
## 6   company   drought     lawn     change     day       flood  school
## 7      area   weather   summer    drought   litre       water  people
## 8       ban condition      ban government    leak     drought    rain
## 9  hosepipe    potato hosepipe   wildlife   order    flooding    goal
## 10     year vegetable     butt       plan     use        area  garden
##        Topic 8  Topic 9 Topic 10
## 1        water     year     rain
## 2          ban     cent     year
## 3     hosepipe      per rainfall
## 4     customer     home    month
## 5      drought  drought  weather
## 6      company    flood      per
## 7  temperature property     cent
## 8          day  village  drought
## 9         area flooding   record
## 10        week  climate    water
x_topic_frex %>% head(10)
##        Topic 1   Topic 2      Topic 3     Topic 4      Topic 5      Topic 6
## 1       agency    barley    butterfly      beaver         leak         hail
## 2        river    potato      robinia     wetland        meter        flash
## 3         flow     wheat         leaf        vole       target thunderstorm
## 4       status   harvest     bluebell        toad        order      tornado
## 5  groundwater livestock        leave     lapwing desalination         wind
## 6  environment      crop pseudoacacia        bird         bill     downpour
## 7  restriction    grower        berry     habitat        litre     flooding
## 8       permit     yield          ivy  population     metering        alert
## 9        level    farmer         lawn abstraction         pipe    lightning
## 10     drought    carrot          pot      specie      leakage        blaze
##      Topic 7  Topic 8    Topic 9 Topic 10
## 1       lock customer subsidence       mm
## 2       cave     pool      claim     rain
## 3    prairie   health    insurer  average
## 4     trophy     fire    council     snow
## 5       goal hosepipe      barge rainfall
## 6       show      ban    village     inch
## 7     school   bottle      beach    month
## 8       game wildfire    defence   pollen
## 9   defender heatwave       ship  aquifer
## 10 housewife     fine        map   figure

4.6 Post-processing

Determine the dominant topic for articles

# assign topical probability to each document  
theta <- x_model[["theta"]] %>% as.data.frame()


# what is the maximum probability? 
theta$max <- apply(theta, 1, max, na.rm=TRUE)
theta$temp_topic <- colnames(theta)[apply(theta,1,which.max)]


# does the max probability exceed 3*(1/optimalK)? 
theta$final_topic <- colnames(theta)[apply(theta[, c(1:as.numeric(optimalK))],1,which.max)]
theta$final_topic[which(theta$max < 3*(1/as.numeric(optimalK)))] <- NA  # threshold set to 3*(1/optimalK) 
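# e.g. with optimalK = 10 the cut-off is 3 * (1/10) = 0.3: a document whose 
# largest topic proportion falls below 0.3 is left unassigned (NA)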


# add relevant metadata 
theta$year <- x$year

theta$month <- x$month

theta$Art_ID <- x$Art_ID

theta$source <- x$source

4.7 Visualize topic modeling results

4.7.1 Create a reference

# color palette function 
get_Palette <- colorRampPalette(brewer.pal(12, "Set3"))

colorpalette <- get_Palette(as.numeric(optimalK))


# elements for legend 
topic_label <- c()

for (k1 in c(1:optimalK)) {
  temp <- paste0(k1, ". ", x_topic_prob[1,k1], 
                 ", ", x_topic_prob[2,k1], 
                 ", ", x_topic_prob[3,k1])
  
  topic_label <- c(topic_label, temp)
  rm(temp, k1)
}

temp_order <- make.dt(x_model)
temp_order <- summarize_all(temp_order, mean)
temp_order <- temp_order[, -c(1)]
temp_order <- unlist(temp_order)
temp_order <- order(temp_order, decreasing = TRUE)

topic_label <- topic_label[temp_order] # the order follows the result from plot.STM()
topic_level <- paste0("V", temp_order) # the order follows the result from plot.STM()    


topic_label_new <- c("#1.Drought status", "#2.Agriculture", "#3.Gardening", 
                     "#4.Aquatic ecosystem","#5.Water company", "#6.Flooding", 
                     "#7.N/A", "#8.Hosepipe ban", "#9.N/A", "#10.Rain forecast")

topic_label_new <- topic_label_new[temp_order]

4.7.2 Topic composition by month (Supplementary 4b)

Generate a summary file

theta_summary3 <- theta %>% 
  
  complete(year, month, final_topic, fill = list(z = NA)) %>% 
  
  group_by(year, month, final_topic, .drop = F) %>% 
  
  summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>% 
  
  group_by(month) %>% 
  
  mutate(ratio = count/sum(count)) %>% 
  
  drop_na() %>%
  
  mutate(ym = paste0(year, "-", month))

theta_summary3$month <- factor(theta_summary3$month) 

theta_summary3 <- theta_summary3 %>% as.data.frame()

count_bymonth3 <- aggregate(count ~ ym, data = theta_summary3, FUN = sum) %>% 
  select(count) %>% as.vector()

Plot by count

ggplot(theta_summary3, 
       aes(x = month, y = count, fill = factor(final_topic, levels = topic_level))) + 
  
  geom_bar(stat = "identity", position = position_stack(reverse = T)) + 
  
  theme_bw() + 
  
  xlab(" ") + 
  
  ylab("count")+ 
  
  facet_wrap(~year,  ncol=5) + 
  
  scale_fill_manual(label = topic_label, values = colorpalette, na.value = "gray70") +
  
  theme(legend.position = "right", legend.title = element_blank(),
        
        plot.title = element_text(size=12, face = "bold"),
        
        plot.subtitle =element_text(size=10, face = "italic")) 

4.7.3a Visualize by month: 2012 (Fig. 5)

Set a target year

y <- 2012
y0 <- "2012"

theta_summary2 <- theta %>% 
  
  dplyr::filter(year == y) %>% 
  
  dplyr::filter(month %in% reference$month[reference$year == y]) %>% 

  
  complete(month, final_topic, fill = list(z = NA)) %>% 
  
  group_by(month, final_topic, .drop = F) %>% 
  
  summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>% 
  
  group_by(month) %>% 
  
  mutate(ratio = count/sum(count)) %>% 
  
  drop_na() %>% 
  
  as.data.frame()

count_bymonth <- aggregate(count ~ month, data = theta_summary2, FUN = sum) %>% 
  select(count) %>% as.vector()

Plot by count – treemap

temp_palette <- data.frame(final_topic = topic_level,
                           colors = colorpalette, 
                           topic_label0 = topic_label_new)

theta_summary2 <- left_join(theta_summary2, temp_palette, by = "final_topic")

theta_summary2 <- theta_summary2 %>%
  mutate(month_abb = month.abb[month])

treemap(theta_summary2,

        index = c("month_abb", "topic_label0"),

        vSize = "count",

        type = "color",

        vColor = "colors",

        fontface.labels=c(2,1),

        bg.labels=c("transparent"),

        align.labels=list(
          c("left", "top"),
          c("center", "center")
        ),

        fontsize.labels=c(30,18),

        fontsize.title = 18,

        border.col=c("black","white"),

        title = paste0(datarange[y0],
                       ". articles per month = ", count_bymonth)
        )

4.7.3b Visualize by month: 2022 (Fig. 5)

Set a target year

y <- 2022
y0 <- "2022"

theta_summary2 <- theta %>% 
  
  dplyr::filter(year == y) %>% 
  
  dplyr::filter(month %in% reference$month[reference$year == y]) %>% 

  complete(month, final_topic, fill = list(z = NA)) %>% 
  
  group_by(month, final_topic, .drop = F) %>% 
  
  summarize(count=sum(!is.na(Art_ID)), .groups = "drop") %>% 
  
  group_by(month) %>% 
  
  mutate(ratio = count/sum(count)) %>% 
  
  drop_na() %>% 
  
  as.data.frame()

count_bymonth <- aggregate(count ~ month, data = theta_summary2, FUN = sum) %>% 
  select(count) %>% as.vector()

Plot by count – treemap

temp_palette <- data.frame(final_topic = topic_level,
                           colors = colorpalette, 
                           topic_label0 = topic_label_new)

theta_summary2 <- left_join(theta_summary2, temp_palette, by = "final_topic")

theta_summary2 <- theta_summary2 %>%
  mutate(month_abb = month.abb[month])


treemap(theta_summary2,

        index = c("month_abb", "topic_label0"),

        vSize = "count",

        type = "color",

        vColor = "colors",

        fontface.labels=c(2,1),

        bg.labels=c("transparent"),

        align.labels=list(
          c("left", "top"),
          c("center", "center")
        ),

        fontsize.labels=c(30,18),

        fontsize.title = 18,

        border.col=c("black","white"),

        title = paste0(datarange[y0],
                       ". articles per month = ", count_bymonth)
        )

4.8 Frequent entities (Table 1)

spacy_11 <- readRDS(paste0("./ProcessedData/N_drought+england_ALL_spacy1.rds"))

topic_subset <- bind_cols(x, theta)



temp_all <- data.frame()

for (t in c(1, 10, 5,6,8,4,2,3,7,9)) {
  topic_subset0 <- topic_subset %>% filter(final_topic %in% paste0("V", t)) 

  spacy_1_topicsubset <- spacy_11 %>% subset(doc_id %in% topic_subset0$doc_id)
  
  # entity 
  spacy_1_topic_entity <- spacy_1_topicsubset[!(spacy_1_topicsubset$entity==""), ]
  
  ## organization
  spacy_1_topic_org <- spacy_1_topic_entity %>% 
    
    subset(entity %in% c("ORG_B")) %>% count(entity_result, sort = TRUE) 
  
  temp <- data.frame("topic" = t,
                     "org" = head(spacy_1_topic_org, n = 10))
  
  temp_all <- rbind(temp_all, temp)
  
  # remove byproducts 
  rm(spacy_1_topicsubset, spacy_1_topic_entity,
     spacy_1_topic_org, temp)
  
}

temp_all
##     topic                    org.entity_result org.n
## 1       1               the_Environment_Agency   274
## 2       1               The_Environment_Agency   122
## 3       1                         Thames_Water   119
## 4       1                                   EA    84
## 5       1                       Southern_Water    73
## 6       1                   Environment_Agency    62
## 7       1                     South_East_Water    59
## 8       1                        Anglian_Water    56
## 9       1                           Government    48
## 10      1                       the_Met_Office    44
## 11     10               the_Environment_Agency    47
## 12     10                       the_Met_Office    47
## 13     10                           Met_Office    35
## 14     10               The_Environment_Agency    27
## 15     10                         Thames_Water    26
## 16     10                       Southern_Water    19
## 17     10                       The_Met_Office    19
## 18     10                     United_Utilities    13
## 19     10 the_Centre_for_Ecology_and_Hydrology    12
## 20     10                             Water_UK    11
## 21      5                         Thames_Water   138
## 22      5                                Ofwat   116
## 23      5               the_Environment_Agency    53
## 24      5                       Southern_Water    44
## 25      5                     South_East_Water    38
## 26      5               The_Environment_Agency    36
## 27      5                           Government    30
## 28      5                               Thames    26
## 29      5                     United_Utilities    23
## 30      5                             Water_UK    23
## 31      6               the_Environment_Agency    79
## 32      6                       the_Met_Office    73
## 33      6                       The_Met_Office    64
## 34      6               The_Environment_Agency    50
## 35      6                           Met_Office    37
## 36      6                         Thames_Water    35
## 37      6                                   EA    32
## 38      6                       Southern_Water    29
## 39      6                   Environment_Agency    18
## 40      6                      Yorkshire_Water    15
## 41      8                         Thames_Water   154
## 42      8                       Southern_Water    82
## 43      8               the_Environment_Agency    69
## 44      8                     South_East_Water    63
## 45      8                           Government    45
## 46      8                       the_Met_Office    44
## 47      8                      Yorkshire_Water    39
## 48      8                          Welsh_Water    38
## 49      8                           Met_Office    37
## 50      8           the_National_Drought_Group    33
## 51      4               the_Environment_Agency    41
## 52      4                         Thames_Water    38
## 53      4                       Southern_Water    18
## 54      4                      Natural_England    17
## 55      4                                 RSPB    17
## 56      4                                Ofwat    13
## 57      4               The_Environment_Agency    11
## 58      4                                  WWF    11
## 59      4                               Labour    10
## 60      4                                Defra     9
## 61      2                                  NFU    38
## 62      2               the_Environment_Agency    19
## 63      2                                   EU    12
## 64      2         the_National_Farmers_'_Union    10
## 65      2                           Government     7
## 66      2                                 Lidl     7
## 67      2                                Shell     7
## 68      2           the_National_Drought_Group     7
## 69      2                                 Bank     6
## 70      2                             Guardian     6
## 71      3                         Thames_Water    11
## 72      3                       Southern_Water    10
## 73      3               the_Environment_Agency    10
## 74      3                     South_East_Water     8
## 75      3                       the_Met_Office     8
## 76      3                                  RHS     6
## 77      3                        Anglian_Water     5
## 78      3                             Hozelock     5
## 79      3                       Veolia_Central     4
## 80      3                    Veolia_South_East     4
## 81      7                              Chelsea    16
## 82      7                                  RHS    15
## 83      7                              Arsenal    12
## 84      7                           Government    10
## 85      7                         Thames_Water    10
## 86      7                            Tottenham     8
## 87      7                                  IMF     7
## 88      7                              Cabinet     6
## 89      7                                 City     6
## 90      7                            West_Brom     6
## 91      9               the_Environment_Agency    11
## 92      9                      The_Independent    10
## 93      9                         Thames_Water     9
## 94      9               The_Environment_Agency     9
## 95      9                                  BGS     8
## 96      9                       the_Met_Office     8
## 97      9                                   EU     7
## 98      9                           Government     7
## 99      9                       Southern_Water     7
## 100     9                                 HSBC     6