10 Puerto Rico tournament index

In this section we consider tournament data from the U.S. Caribbean as a potential predictive index for dolphin abundance in the South Atlantic. Because dolphin migrate through the Caribbean region in the months before arriving in South Atlantic waters, peaks in catch-per-unit effort for fisheries operating in the Caribbean could potentially be indicative of high abundances later in the year in waters downstream of their migration patterns. Tournament data are available from Puerto Rico’s Departamento de Recursos Naturales y Ambientales, División de Pesquería Recreativa y Deportiva, going back to the year 2000. Tournament data were requested and received from the institution in June 2022.

10.1 Upload and clean data set

We first input the data set, parse out the dates to extract month and year, and standardize labeling of months.

# clear workspace (removes every object in the global environment)
rm(list = ls())

# Install any package that is not yet available. requireNamespace() only
# checks availability without attaching, so the library() calls below are the
# single place where packages are loaded, and they stop with an error if an
# installation did not succeed.
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
if (!requireNamespace("emmeans", quietly = TRUE)) install.packages("emmeans")

library(dplyr)   # supplies the %>% pipe
library(emmeans) # Best for extracting standardized indices

# import data -------------------------------------
# The accented letters in the Location column are printed as raw bytes
# (e.g. "N\xe1utico") when the file is read with the default encoding, which
# indicates a single-byte (Latin-1 style) file. Declaring the encoding lets R
# convert the text on read so that it prints and compares correctly.
d <- read.csv("data/PRDNER-DolTournamentData.csv", fileEncoding = "latin1")
#apply(d, 2, table)
head(d)

      Date                       Location Total.Number.Particpants
1 May-6-00 Club N\xe1utico de La Parguera                      124
2 May-6-00 Club N\xe1utico de La Parguera                      124
3 May-6-00 Club N\xe1utico de La Parguera                      124
4 May-6-00 Club N\xe1utico de La Parguera                      124
5 May-6-00 Club N\xe1utico de La Parguera                      124
6 May-6-00 Club N\xe1utico de La Parguera                      124
  Total.Number.of.Boats Average.Time.Spent.Fishing Tournament.Duration
1                    31                       10.5                   2
2                    31                       10.5                   2
3                    31                       10.5                   2
4                    31                       10.5                   2
5                    31                       10.5                   2
6                    31                       10.5                   2
   Fish.Type   Fish.name Sex Boarded Bycatch Lenght..mm. Weight..Kg.
1 8835290101 Dolphinfish   F    TRUE   FALSE        1675        9.09
2 8835290101 Dolphinfish   F    TRUE   FALSE        1675        6.81
3 8835290101 Dolphinfish   M    TRUE   FALSE        1675       10.45
4 8835290101 Dolphinfish   F    TRUE   FALSE        1625       10.00
5 8835290101 Dolphinfish   F    TRUE   FALSE        1625       12.27
6 8835290101 Dolphinfish   F    TRUE   FALSE        1625       10.00
  Distance.to.coast Zone
1                      2
2                      2
3                      2
4                      2
5                      2
6                      2

# clean dates and extract month day year ---------
# Date strings have the form "May-6-00": month, day and year joined by "-".
d$Date <- as.character(d$Date)

# Split every date once and pick each component by position. A component that
# is absent from a malformed date becomes NA. The new columns are assigned by
# name rather than by hard-coded column position, so the code does not depend
# on how many columns the input file has, and it also works on zero rows.
date_parts <- strsplit(d$Date, "-", fixed = TRUE)
d$month <- vapply(date_parts, `[`, character(1), 1L)
d$day <- vapply(date_parts, `[`, character(1), 2L)
d$year <- vapply(date_parts, `[`, character(1), 3L)
#head(d)

# clean up errors and check outputs ---------------------
# keep the first three letters of each month label, then map the two variant
# spellings of April onto the English abbreviation
d$month <- substr(d$month, 1, 3)
d$month[d$month %in% c("Abr", "Arp")] <- "Apr"

# month names that ended up in the day or year field cannot be converted to
# numbers, so they are set to NA before the conversion
d$day[d$day %in% c("November", "December")] <- NA
d$year[d$year %in% c("November", "December")] <- NA

d$day <- as.numeric(d$day)
d$year <- as.numeric(d$year)

# standardize year format
# values below 83 get 2000 added; anything still below 2000 after that step
# gets 1900 added
in_2000s <- !is.na(d$year) & d$year < 83
d$year[in_2000s] <- d$year[in_2000s] + 2000
in_1900s <- !is.na(d$year) & d$year < 2000
d$year[in_1900s] <- d$year[in_1900s] + 1900

# numeric month (position of the abbreviation within month.abb)
d$mon <- match(d$month, month.abb)

# number of records per month label
table(d$month)


 Apr  Aug  Dec  Feb  Jan  Jul  Jun  Mar  May  Nov  Oct  Sep 
7723   57  614 2360 2160   35   23 7995 1496 1036 1344  273

# number of records for each day of the month
table(d$day)


   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
 694  829  792  689  718 1123  618  753  733 1417  493  775 1061  936  278  517 
  17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
 894  859  524 1213  726  875  880  655  661  808  854 1192  850  563 1050

# number of records for each calendar year
table(d$year)


1983 1984 1985 1986 1987 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 
  80  153    5   53   19  843 1254 1413 1040 1303  944 1411 1418 1209 1190  538 
2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 
1263 1815 1321 1153 1512  916  944  808 1213   51   91 1070

In keeping with the definition of seasons in other parts of the MSE, we group December with the following January and February in a winter season.

# fix so Dec is grouped with following year -----------------
# December is recoded from 12 to 0.5 so that it sorts ahead of January, and
# year2 holds the year with December records moved into the following year
dec_rows <- which(d$mon == 12)
d$mon[dec_rows] <- 0.5
d$year2 <- d$year
d$year2[dec_rows] <- d$year[dec_rows] + 1

# record counts per year before (row 1) and after (row 2) the regrouping
rbind(table(d$year), table(d$year2))

     1983 1984 1985 1986 1987 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
[1,]   80  153    5   53   19  843 1254 1413 1040 1303  944 1411 1418 1209 1190
[2,]   80  153    5   53   19  789 1220 1501 1030 1313  944 1411 1418  863 1536
     2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
[1,]  538 1263 1815 1321 1153 1512  916  944  808 1213   51   91 1070
[2,]  538 1263 1815 1321 1153 1512  915  945  801 1190   81   85 1076

#head(d)
#tail(d)

10.2 Viewing the raw data

Let’s take a look at some of the data columns using barplots and histograms.

# look at data columns ----------------------------------

# one barplot of record counts per categorical column, on a 3 x 2 grid
par(mfrow = c(3, 2))
invisible(Map(
  function(values, label) {
    barplot(table(values), main = paste("Number of observations -", label))
  },
  d[c("mon", "year", "day", "Sex", "Zone", "Bycatch")],
  c("Month", "Year", "Day", "Sex", "Zone", "Bycatch")
))

# histograms of the tournament-level attributes, on a 2 x 2 grid
par(mfrow = c(2, 2))
hist(d$Total.Number.Particpants,
     main = "Total number of participants")
hist(d$Total.Number.of.Boats,
     main = "Total number of boats")
hist(d$Average.Time.Spent.Fishing,
     main = "Average time spent fishing")
hist(d$Tournament.Duration,
     main = "Tournament duration")

There are a few NAs in the tournament data columns. We need to consider what details are available and what factors we can feasibly use in the standardization process.

# find NAs in tournament details -------------------
# records with (TRUE) and without (FALSE) a missing number of boats
table(is.na(d$Total.Number.of.Boats))


FALSE  TRUE 
24331   785

# records with (TRUE) and without (FALSE) a missing number of participants
table(is.na(d$Total.Number.Particpants))


FALSE  TRUE 
24189   927

# records where participants AND boats are both missing (TRUE)
table(is.na(d$Total.Number.Particpants) & is.na(d$Total.Number.of.Boats))


FALSE  TRUE 
24332   784

# records with (TRUE) and without (FALSE) a missing average fishing time
table(is.na(d$Average.Time.Spent.Fishing))


FALSE  TRUE 
23290  1826

# participants plotted against boats, with the fitted least-squares line
# overlaid (drawn in palette colour 8)
plot(d$Total.Number.of.Boats, d$Total.Number.Particpants,
     xlab = "total number of boats",
     ylab = "total number of participants")
out <- lm(Total.Number.Particpants ~ Total.Number.of.Boats, data = d)
abline(out, col = 8)

The total number of boats is highly correlated with the total number of participants, and the linear regression indicates that most boats have 4 participants.

10.3 Data preparation for standardization

In preparation for creating a standardized catch-per-unit-effort (CPUE) index that can serve as a proxy for abundance, we have to specify nuisance factors in the data set that have to be accounted for. We create a unique tournament ID by combining the date (day, month and year) with the number of participants.

# create unique tournament ID number -------------------------
# dat is a date key: day, month label and year pasted together
d$dat <- paste0(d$day, d$month, d$year)
# number of distinct date + participant-count combinations
length(unique(paste0(d$dat, d$Total.Number.Particpants)))

[1] 386

# number of distinct date + location combinations
length(unique(paste0(d$day, d$month, d$year, d$Location)))

[1] 387

# number of distinct date + boat-count combinations
length(unique(paste0(d$day, d$month, d$year, d$Total.Number.of.Boats)))

[1] 386

# number of distinct date + tournament-duration combinations
length(unique(paste0(d$day, d$month, d$year, d$Tournament.Duration)))

[1] 366

# tournament ID: the date key followed by the number of participants
d$datID <- paste0(d$dat, d$Total.Number.Particpants)

#barplot(table(d$mon))

We remove all cases where dolphin is not listed as the target species; we will use only directed dolphin trips in the standardization. We also specify the months that we want to include in the index of abundance. Here we specify December to March. With this subset of months, we have 128 unique tournaments in the database across 121 different dates.

# remove non-target cases; use only directed trips -------------------
# keep an untouched copy of all records before filtering
dfull <- d
# retain rows where Bycatch is FALSE (rows with a missing flag are dropped)
not_bycatch <- !is.na(d$Bycatch) & d$Bycatch == FALSE
d <- d[not_bycatch, ]

# subset by season, e.g. 0 - 4 is December to April -------------------
# December was recoded to 0.5, so the bounds 0 to 3 keep December to March
in_season <- !is.na(d$mon) & d$mon >= 0 & d$mon <= 3
d <- d[in_season, ]

#length(unique(d$Date))
# number of distinct dates left after filtering
length(unique(d$dat))

[1] 121

# number of distinct tournament IDs left after filtering
length(unique(d$datID))

[1] 128

Next we calculate the total weight by tournament, by summing the reported weights across each tournament ID. We then set up a new data frame where each tournament is a row, and columns represent the attributes of each tournament (year, month, zone, participants, number of boats, fishing time, duration, total weight and total abundance). We also calculate the average weight by dividing total weight by total abundance.

# calculate total by tournament ----------------------------
# summed weight per tournament ID, ignoring missing weights
totWT <- with(d, tapply(Weight..Kg., datID, sum, na.rm = TRUE))
# number of records per tournament ID
totN <- table(d$datID)
# month and (December-shifted) year per tournament ID
mon <- with(d, tapply(mon, datID, mean, na.rm = TRUE))
year <- with(d, tapply(year2, datID, mean, na.rm = TRUE))

# checks on month and tournament assignments ----------------------
#table(d$year[d$mon == 0.5], d$year2[d$mon == 0.5])
#table(d$year[d$mon != 0.5], d$year2[d$mon != 0.5])

# largest within-tournament standard deviation of the participant count
max(with(d, tapply(Total.Number.Particpants, datID, sd)), na.rm = TRUE)  # should all be zeros

[1] 0

# largest within-tournament standard deviation of the boat count
max(tapply(d$Total.Number.of.Boats, d$datID, sd), na.rm = T)

[1] 0

# largest within-tournament standard deviation of the average fishing time
max(tapply(d$Average.Time.Spent.Fishing, d$datID, sd), na.rm = T)

[1] 0

# assign attributes of tournaments -------------------------------
# each attribute is averaged over the records that share a tournament ID
Npart <- with(d, tapply(Total.Number.Particpants, datID, mean, na.rm = TRUE))
Nboat <- with(d, tapply(Total.Number.of.Boats, datID, mean, na.rm = TRUE))
Ftime <- with(d, tapply(Average.Time.Spent.Fishing, datID, mean, na.rm = TRUE))
dura <- with(d, tapply(Tournament.Duration, datID, mean, na.rm = TRUE))
zone <- with(d, tapply(Zone, datID, mean, na.rm = TRUE))

# new data frame for standardization ----------------------------------
# one row per tournament ID; cbind() binds the per-tournament vectors
# column-wise before the conversion to a data frame
dat <- data.frame(
  cbind(year, mon, zone, Npart, Nboat, Ftime, dura, totWT, totN)
)
# average weight per record = total weight / number of records
dat$avwt <- with(dat, totWT / totN)

head(dat)

             year mon zone Npart Nboat Ftime dura   totWT totN     avwt
10Feb2007140 2007   2    1   140    35   8.0    1  347.64   49 7.094694
10Feb200824  2008   2    1    24    15   7.0    1  264.71   33 8.021515
10Jan2009332 2009   1    1   332    83   8.0    2  770.44  147 5.241088
10Jan2014188 2014   1    1   188    47   7.5    2 1479.82  249 5.943052
10Jan2015136 2015   1    1   136    34   7.5    2  794.72  264 3.010303
10Mar2001192 2001   3    2   192    48   8.0    2 2499.33  272 9.188713

There are a number of missing values in this data frame. For fishing time, we impute the median of the known values into the missing values. For missing values of the number of boats, we take the number of participants and divide by 4 (given the relationship established above). If both the number of boats and the number of participants are missing, we impute the median number of boats into the unknown values.

# fill in NAs ---------------------------------------------------
# a value of zero for boats, participants or fishing time is treated as missing
dat$Nboat[dat$Nboat %in% 0] <- NA
dat$Npart[dat$Npart %in% 0] <- NA
dat$Ftime[dat$Ftime %in% 0] <- NA

#hist(dat$Ftime)
# rows with a missing fishing time
which(is.na(dat$Ftime))

[1]  15  18  68  72  86  94 101 127 128

# replace missing fishing times with the median of the observed ones
dat$Ftime <- replace(dat$Ftime, is.na(dat$Ftime),
                     median(dat$Ftime, na.rm = TRUE))

#hist(dat$Nboat)
# rows with a missing number of boats
which(is.na(dat$Nboat))

[1]  15  83  94 122 128

# rows with a missing number of participants
which(is.na(dat$Npart))

[1]  11  12  15  94 101 110 122 128

# participant counts available for the rows whose boat count is missing
dat$Npart[is.na(dat$Nboat)]

[1] NaN  33 NaN  NA NaN

#plot(dat$Nboat, dat$Npart)
#dat$Npart / dat$Nboat
# step 1: where the boat count is missing, use participants / 4
dat$Nboat <- ifelse(is.na(dat$Nboat), dat$Npart / 4, dat$Nboat)
#dat$Npart[is.na(dat$Nboat)]
# step 2: rows still missing (no participant count either) get the median of
# the boat counts available after step 1
dat$Nboat <- replace(dat$Nboat, is.na(dat$Nboat),
                     median(dat$Nboat, na.rm = TRUE))

# confirm no boat counts are missing after imputation
which(is.na(dat$Nboat))

integer(0)

# confirm no fishing times are missing after imputation
which(is.na(dat$Ftime))

integer(0)

# confirm every tournament has a record count
which(is.na(dat$totN))

integer(0)

Finally, we subset the years. Data are very sparse prior to 2000, so we only include the data from 2000 through 2020. We calculate the catch-per-unit-effort, defining catch as the total abundance, and effort as the number of boats multiplied by the average time spent fishing.

# subset years -----------------------------------

# years to keep (dat$year is derived from year2, in which December records
# are counted with the following year)
yrs <- 2000:2020

# is.na() is also TRUE for NaN, so this turns any remaining NaN into plain NA
dat[is.na(dat)] <- NA
# keep tournaments whose year falls inside the range of yrs
dat <- dat[which(dat$year >= min(yrs) & dat$year <= max(yrs)), ]

# calculate CPUE -----------------------------------
# effort = number of boats x average time spent fishing
dat$eff <- dat$Nboat * dat$Ftime   # dat$dura
dat$cpue <- dat$totN / dat$eff     # catch in number of records per unit effort
dat$cpueW <- dat$totWT / dat$eff   # catch in summed weight per unit effort

# Rows with an infinite CPUE (division by zero effort). is.infinite() tests the
# numeric value directly rather than comparing against the string "Inf".
which(is.infinite(dat$cpue))

integer(0)

# rows with an infinite weight-based CPUE, tested numerically rather than by
# comparison with the string "Inf"
which(is.infinite(dat$cpueW))

integer(0)

#dat$cpue[which(dat$cpue == "Inf")] <- NA

# number of rows of dat (one row per tournament ID) in each year
barplot(table(dat$year), las = 2, main = "Number of tournaments per year")

The plot shows the number of tournaments per year. We can see that there are relatively few events for each year.

10.4 Analyze the nominal CPUE trends

Now using this clean data set we can look at the nominal (average) CPUE by year. We first look at the average weight across all tournaments, and then separately for the southern coast and for the other coasts. The weights appear to be highly variable across years and coasts.

# look at nominal trends ----------------------------
# three stacked panels per page
par(mfrow = c(3, 1), mar = c(3, 5, 3, 1))

# yearly mean of the per-tournament average weight: all zones
barplot(with(dat, tapply(avwt, year, mean, na.rm = TRUE)),
        las = 2,
        main = "Average weight of tournament-caught dolphinfish by year",
        ylab = "average weight (kg)")

# same summary restricted to zone 2
d2 <- subset(dat, zone == 2)
barplot(with(d2, tapply(avwt, year, mean, na.rm = TRUE)),
        las = 2,
        main = "Average weight of tournament-caught dolphinfish by year\n(Southern coast)",
        ylab = "average weight (kg)")

# same summary for every zone other than 2
d2 <- subset(dat, zone != 2)
barplot(with(d2, tapply(avwt, year, mean, na.rm = TRUE)),
        las = 2,
        main = "Average weight of tournament-caught dolphinfish by year\n(other coasts)",
        ylab = "average weight (kg)")

# look at nominal trends by year, month and zone
barplot(with(dat, tapply(cpue, year, mean, na.rm = TRUE)),
        las = 1,
        main = "Average CPUE of tournament-caught dolphinfish by year",
        xlab = "year", ylab = "CPUE")
barplot(with(dat, tapply(cpue, mon, mean, na.rm = TRUE)),
        las = 1,
        main = "Average CPUE of tournament-caught dolphinfish by month",
        xlab = "month", ylab = "CPUE")
barplot(with(dat, tapply(cpue, zone, mean, na.rm = TRUE)),
        las = 1,
        main = "Average CPUE of tournament-caught dolphinfish by zone",
        xlab = "zone", ylab = "CPUE")

# yearly mean CPUE in numbers against yearly mean CPUE in weight
par(mfrow = c(1, 1), mar = c(5, 5, 1, 1))
plot(with(dat, tapply(cpue, year, mean, na.rm = TRUE)),
     with(dat, tapply(cpueW, year, mean, na.rm = TRUE)),
     xlab = "nominal CPUE by number", ylab = "nominal CPUE by weight")

# correlation between the two yearly series
cor(with(dat, tapply(cpue, year, mean, na.rm = TRUE)),
    with(dat, tapply(cpueW, year, mean, na.rm = TRUE)))

[1] 0.8745617

There appears to be high variability in the CPUE by year. The highest CPUE appears to occur in December and March. CPUE also varies by zone, the Southern coast having the highest catch rates. CPUE based on number and weight are highly correlated.

10.5 Calculate standardized CPUE trends

Now we will carry out the standardization, using year, month and zone as standardization factors. Since the catch rates are highly skewed, we carry out the standardization based on the log CPUE.

# convert variables to factors ----------------------
# year, month and zone enter the model as categorical effects
dat[c("year", "mon", "zone")] <- lapply(dat[c("year", "mon", "zone")], as.factor)

# standardization -----------------------------------
# Gaussian GLM of log CPUE on year, month and zone.
# NOTE(review): the formula refers to columns as dat$...; passing data = dat
# with bare column names is the more usual form - confirm before changing,
# since it alters the coefficient names in the printed summary.
out <- glm(
  log(dat$cpue) ~ dat$year + dat$mon + dat$zone,
  family = gaussian
)
summary(out)


Call:
glm(formula = log(dat$cpue) ~ dat$year + dat$mon + dat$zone, 
    family = gaussian)

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept)  -1.41897    1.00202  -1.416    0.160
dat$year2001  0.29413    0.84273   0.349    0.728
dat$year2002  0.49382    0.85858   0.575    0.567
dat$year2003 -0.51506    0.85809  -0.600    0.550
dat$year2004 -0.13450    0.85593  -0.157    0.875
dat$year2005 -0.17719    0.93153  -0.190    0.850
dat$year2006  0.04658    0.88927   0.052    0.958
dat$year2007  0.84960    0.86556   0.982    0.329
dat$year2008  0.15122    0.85899   0.176    0.861
dat$year2009  0.56933    0.85373   0.667    0.507
dat$year2010 -0.08845    0.86578  -0.102    0.919
dat$year2011  0.47062    0.87309   0.539    0.591
dat$year2012  0.84258    0.86576   0.973    0.333
dat$year2013  0.34104    0.89906   0.379    0.705
dat$year2014  0.76659    0.88318   0.868    0.388
dat$year2015  1.17262    0.88874   1.319    0.190
dat$year2016 -0.85525    0.85299  -1.003    0.319
dat$year2017 -0.08506    0.85856  -0.099    0.921
dat$year2018  0.57657    1.17003   0.493    0.623
dat$year2019  0.30635    0.88178   0.347    0.729
dat$year2020 -0.65522    0.91964  -0.712    0.478
dat$mon1     -0.32315    0.39481  -0.819    0.415
dat$mon2     -0.06517    0.38303  -0.170    0.865
dat$mon3     -0.13434    0.40835  -0.329    0.743
dat$zone1     0.13014    0.47370   0.275    0.784
dat$zone2     0.70758    0.50349   1.405    0.163
dat$zone3     0.38841    0.56139   0.692    0.491
dat$zone4    -0.13385    0.48181  -0.278    0.782

(Dispersion parameter for gaussian family taken to be 0.6288276)

    Null deviance: 101.331  on 119  degrees of freedom
Residual deviance:  57.852  on  92  degrees of freedom
AIC: 310.99

Number of Fisher Scoring iterations: 2

# analysis-of-deviance table for the fitted GLM
anova(out)

Analysis of Deviance Table

Model: gaussian, link: identity

Response: log(dat$cpue)

Terms added sequentially (first to last)

         Df Deviance Resid. Df Resid. Dev      F   Pr(>F)   
NULL                       119    101.331                   
dat$year 20   32.476        99     68.855 2.5823 0.001182 **
dat$mon   3    2.382        96     66.473 1.2628 0.291846   
dat$zone  4    8.620        92     57.852 3.4272 0.011697 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# estimated coefficients from standardization
# estimated marginal means for the year effect, requested on the response
# scale and stored as a plain data frame
ind <- as.data.frame(emmeans(out, "year", type = "response"))
# rename the first column (the year) to "Year"
names(ind)[1] <- "Year"

Finally, we compare the standardized CPUE with the nominal CPUE. The two time series are somewhat closely related, indicating similar years of low and high relative abundance.

# look at nominal vs standardized index ------------------------
# The x-axis years are taken from the objects being plotted rather than from
# yrs, so each series stays aligned with its own years even if a year in yrs
# has no tournaments.
std_years <- as.numeric(as.character(ind$Year))
nominal <- tapply(dat$cpue, dat$year, mean, na.rm = TRUE)
nom_years <- as.numeric(names(nominal))

# standardized index (colour 4) with its confidence limits as dashed lines
plot(std_years, ind$response, type = "l", lwd = 2, col = 4,
     main = "Nominal versus standardized CPUE from dolphin tournament data",
     xlab = "year", ylab = "CPUE (fish / (boats x time))", ylim = c(0, 1.8))
points(std_years, ind$response, pch = 1, lwd = 2, col = 4)
lines(std_years, ind$lower.CL, col = 4, lty = 2)
lines(std_years, ind$upper.CL, col = 4, lty = 2)

# nominal yearly mean CPUE (colour 2)
lines(nom_years, nominal, col = 2, lwd = 2)
points(nom_years, nominal, col = 2, lwd = 2, pch = 1)

# pch = 1 matches the open circles actually drawn for both series
legend("topleft", c("nominal", "standardized"), lwd = 2, pch = 1,
       col = c(2, 4), bty = "n")

# correlation between the standardized and nominal yearly series
cor(ind$response, nominal)

[1] 0.9064402

Finally, let’s look at how our standardized index compares to the South Atlantic landings. We experimented with CPUE in both units of abundance and weight, and for different combinations of winter months (e.g., Dec - Feb, Jan - Apr). The highest correlation with South Atlantic landings occurs with tournament data from December to March with CPUE based on abundance. In this case the tournament data describe about 18% of the variation in South Atlantic landings (adjusted R-squared of 0.18), with 2015 as a well above-average outlier in both the tournament CPUE and the landings.

# landings table; its first column is renamed to "Year" so it can be merged
# with the index
rec <- read.csv("data/recLandings.csv")
colnames(rec)[1] <- "Year"
# ATL = row-wise sum of columns 3 to 6, ignoring missing values
rec$ATL <- rowSums(rec[, 3:6], na.rm = TRUE)

# keep the years present in both the landings and the index
d <- merge(rec, ind, by = "Year")

par(mar = c(4, 6, 2, 1))
# set up the axes with the points drawn in colour 0, then print each year as
# a text label at its (index, landings) position
plot(d$response, d$ATL/10^6,
     col = 0,
     xlab = "standardized tournament CPUE",
     ylab = "total U.S. Atlantic coast recreational landings\n(millions of pounds)")
text(d$response, d$ATL/10^6, d$Year, col = 1)

# linear regression of landings (divided by 10^6) on the standardized index
out <- lm(d$ATL/10^6 ~ d$response)
abline(out, col = 8)
summary(out)


Call:
lm(formula = d$ATL/10^6 ~ d$response)

Residuals:
   Min     1Q Median     3Q    Max 
-4.904 -2.455 -1.423  2.822  8.784 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   11.968      1.892   6.326 4.52e-06 ***
d$response    10.716      4.618   2.320   0.0316 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.819 on 19 degrees of freedom
Multiple R-squared:  0.2208,    Adjusted R-squared:  0.1798 
F-statistic: 5.385 on 1 and 19 DF,  p-value: 0.0316

# annotate the scatterplot with the fit statistics
fit_summary <- summary(out)
# NOTE: this is the *adjusted* R-squared of the regression
r2 <- fit_summary$adj.r.squared
# p-value of the slope (second coefficient row, fourth column)
p_val <- fit_summary$coefficients[2, 4]
# very small p-values are shown as a bound instead of being rounded to 0
p_display <- if (p_val < 0.001) "p < 0.001" else paste("p =", round(p_val, 3))
# the formatted p-value string is what gets displayed in the legend
legend("bottomright",
       legend = bquote(R^2 == .(round(r2, 2)) ~ "; " ~ .(p_display)),
       bty = "n", cex = 1.2, text.col = 4)

# output the index
# save the standardized index table as CSV, without row names
write.csv(
  ind,
  file = "indices/PRtournament_index.csv",
  row.names = FALSE
)