Background

California coastal poll analysis and introduction to R for Coastal and Marine Management class at CSUCI 2019-11.

Notes

Question 7e Climate change …Thomas Fire?

Wide vs long

  • What uniquely describes an observation?
  • Relationships with joins and relational database tables.

Data types

  • numeric
  • character
  • date
  • factor
  • list

Import Data

# libraries ----
library(glue)
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(scales)
library(plotly)
library(DT)

# variables ----
id         <- "1hH68SqNsvAASFn25-X9ssJPkcGSS-SfS6n3zui9hYOQ"
gid        <- "1739121823"
row_beg    <- 4     # first row above data with headers
row_end    <- 1732  # last row of data before summarizing
reload_url <- FALSE

# paths ----
url_csv  <- glue("https://docs.google.com/spreadsheets/d/{id}/export?format=csv&gid={gid}")
all_csv  <- "data/poll_all.csv"
log_csv  <- "data/poll_log.csv"

# download google sheet ----
if (reload_url | !file.exists(all_csv)){
  download.file(url_csv, all_csv)
}

# read headers and data ----
nrows   <- row_end - row_beg

data    <- read_csv(
  all_csv, skip = row_beg - 1, n_max = nrows, guess_max = nrows)

headers <- read_csv(
  all_csv, skip = 0, n_max = row_beg, 
  col_names = F, col_types = strrep("c", ncol(data)))
names(headers) <- names(data)
headers <- headers %>% 
  bind_cols(
    tibble(row = c("question_n", "heading", "question", "answer")))
  
# questions with answers ---
q_a <- headers %>%
  gather(column, value, -row) %>%
  mutate(icol = sapply(1:ncol(data), rep, 4) %>% as.vector()) %>% 
  spread(row, value) %>% 
  arrange(icol) %>%
  fill(question_n) %>% 
  filter(!is.na(answer)) %>%
  mutate(
     question_n = str_remove(question_n, "Question ")) %>% 
  select(question_n, question, heading, answer, icol, column)
#View(q_a)

# questions ----
q <- q_a %>% 
  group_by(question_n) %>% 
  summarize(
    icol     = first(icol),
    question = first(question),
    heading  = first(heading),
    answers  = paste(answer, collapse = " | ")) %>% 
  arrange(icol) %>% 
  select(question_n, question, heading, answers, icol)
# View(q)

# values ----
v <- data %>% 
  filter(str_sub(UID, 1, 8) != "Example ") %>% 
  mutate(UID = as.integer(UID)) %>% 
  gather(column, value, -UID)

# setup list object to pass between functions ----
poll <- list(q=q, q_a=q_a, v=v)

# helper functions ----
question_title <- function(poll, q_n){
  q_i <- filter(poll$q, question_n == q_n)
  glue("{q_i$question_n}. {q_i$question}")
}

plot_question <- function(poll, q_n, answers_rm = c("answered", "comment"), q_rng = c("13")){
  # poll <- list(q=q, q_a=q_a, v=v); q_n <- "7e"; answers_rm = c("answered", "comment")
  # q_n <- "1" # q_n <- "5" # q_n <- "9" # q_n <- "13"
  q_i     <- filter(poll$q, question_n == q_n)
  q_title <- question_title(poll, q_n)
  
  answer_levels <- q_i$answers %>% 
    str_split(fixed(" | "), simplify = T) %>% 
    as.character() %>% 
    setdiff(answers_rm)
  
  if (length(answer_levels) == 0){
    msg <- glue("ERR length(answer_levels) == 0: {q_title}")
    message(msg); return(msg)
    # browser()
  }
  
  v_i <- v %>% 
    left_join(q_a, by="column") %>% 
    filter(question_n == q_n) %>% 
    filter(answer %in% answer_levels) %>% 
    select(UID, answer, value) %>% 
    mutate(
      value  = as.integer(value),
      answer = factor(answer, answer_levels, ordered = T)) %>% 
    filter(!is.na(value))
  
  if (any(na.omit(v_i$value > 1))){
    
    # log questions with values greater than 1
    d_gt1 <- v_i %>% filter(value > 1)
    log_gt1_tbl <- tibble(
      question  = q_title,
      n_values  = nrow(d_gt1),
      condition = "value > 1",
      first5    = glue(
        "UID:   {paste(head(d_gt1$UID,    5), collapse=', ')}; 
        answer: {paste(head(d_gt1$answer, 5), collapse=', ')}; 
        value:  {paste(head(d_gt1$value,  5), collapse=', ')}"))
    if (file.exists(log_csv)){
      write_csv(log_gt1_tbl, log_csv, append=T)
    } else {
      write_csv(log_gt1_tbl, log_csv)
    }
      
    p <- ggplot(v_i, aes(x = answer, y = value, fill = answer)) + 
      geom_boxplot() + 
      theme(legend.position = "none") +
      coord_flip() + 
      ggtitle(q_title) + 
      theme(axis.title = element_blank())
    
  } else {
    
    v_i <- v_i %>% 
      group_by(answer) %>% 
      summarize(
        n   = sum(value, na.rm = T)) %>% 
      mutate(
        pct = n / sum(n))
    
    p <- ggplot(v_i, aes(x = answer, y = pct, fill = answer, label = percent(pct))) + 
      geom_col(position = 'dodge') + 
      geom_text(size = 3) +
      scale_y_continuous(labels = percent) +
      theme(legend.position = "none") +
      coord_flip() + 
      ggtitle(q_title) + 
      theme(axis.title = element_blank())
  }
  print(p)
}

# show questions ----
datatable(q %>% select(-icol), rownames=F)

Poll Answers

# remove questions that are not easy bar charts or box plots
q_n_rm <- c("UID", "Meta", "1")
q_ns <- setdiff(poll$q$question_n, q_n_rm)

# loop over questions
unlink(log_csv) # clear log
for(q_n in q_ns) { # q_n = q_ns[1]
  cat("\n\n###",  question_title(poll, q_n), "\n")
  plot_question(poll, q_n)
}

2. 1969 event…

3. Know of wetland?

4. Wetland change

5. Wetland % change

6. Climate change problem?

7a. Climate change…home prices?

7b. Climate change…poor air quality?

7c. Climate change…fires of F 2018?

7d. Climate change …2017 hurricanes?

7e. Climate change …Thomas Fire?

7f. Climate change …CA drought?

7g. Climate change …coastal erosion?

8. Taking too many fish?

9. Avoided going to beaches?

10. Coast $ this Week

11. Coastal resource influences

12a. Safe in 2000 Parks?

12b. Safe in 2015 Parks?

12c. Safe in 2019 Parks?

13. Rank threats to coastal

14. Age 1st to beach

15. Source of news

16. Good mgmnt of Coast?

17. CA ocean health better?

18. Heard of institutions?

19. CA fisheries health better?

20. Purchasing seafood

21. Seafood eating

22. Seafood Oz.

23a. Gulf of Mexico seafood safe?

23b. Santa Barbara seafood safe?

23c. Non-SB California seafood safe?

23d. Alaska seafood safe?

23e. Japan seafood safe?

23f. China seafood safe?

23g. Norway seafood safe?

23h. Thailand seafood safe?

24. Valuation of nature

25. Endangered Sp laws

26. Last natural area visited

27. Driving Distance to Beach

28a. Visit natural areas

28b. Visit SoCal Beaches

28c. Visit Channel Islands?

28d. Visit National Parks

29. Channel Islands visited

30. Fires serious impact?

31. Fishing Regulation Level

32a. Malibu rebuilt (post-Woolsey)?

32b. Paradise rebuilt (post-Camp)?

32c. Ventura rebuilt (post-Thomas)?

32d. New Orleans rebuilt?

32e. Jersey Shore rebuilt?

32f. Puerto Rico rebuilt?

32g. Bahamas rebuilt?

33. Coastal Gov’t?

34. Drone Use

35. Operated Drone

36. Heard of the following in relation to seafood

37. Fishing Regulation Types

38. Economic System

39. CA Offshore oil drilling?

40. Coastal Management after 1969 SB Oil Spill

41. Heard about Plastic Pollution?

42. Considered moving

43a. Beach Nourishment

43b. Homeless

43c. Wildlife Crossings

43d. Exiting Paris Accord

43e. Cultivate Marijuana

43f. Utilities Wildfire Liability

43g. Reduce Nat’l Monuments

43h. Plastic bag bans

43i. CI Nat’l Park estab

44a. Joint Resonse to Deepwater Horizon Oil Spill

44b. Joint Resonse to Refugio Oil Spill

44c. Joint Resonse to 1969 SB Oil Spill

45. Zip Code

46. Years at Zip

47. Occupation

48. Year Born

49. Household Income

50. Voter

51. Gender

52. Education

53. Language

54. Env Issues

Log check

Let’s look at the log output where values > 1 were detected, which forced the graphing into a boxplot to assume an average value over a range. However where only a few values were > 1, these values might need to be just a 1 or 0 to reflect a Yes or No value to be summed.

read_csv(log_csv) %>% 
  arrange(n_values) %>% 
  datatable()