Mr. Whitworth, Mr. Tennenbaum, thank you for the opportunity to work with Anheuser Busch InBev to analyze data on a set of craft beers and the breweries that make them. We’ve received the datasets containing information on over 2400 craft beers made by over 550 breweries.
We’ve begun to explore these datasets with the intention of answering the questions you previously provided. I believe we’ve been able to extract insights that you will find valuable to Anheuser Busch InBev.
With that, if you are ready, let’s walk through our initial findings within the data.
library(dplyr)
library(ggplot2)
library(tidyr)
#Read in Breweries.csv and setup a list with the state and count of each state, then build a barplot
Breweries <- read.csv("Breweries.csv", header = TRUE , ",")
totals <- Breweries %>% count (State)
q1 <- ggplot(data=totals, aes(x = State,y=n, fill = n, width = 0.8)) + geom_bar(stat='identity', width=0.7) + scale_fill_gradient2(midpoint=mean(totals$n), low= '#971B1E', mid = '#E02124', high = '#FFF200', space='#E1E1E1') + geom_text(aes(x= State,n +1, label = paste(State, n, sep=' - ')), data = totals, position = position_dodge(width = 1), hjust = -0.1, size = 4) + ggtitle("Total Breweries in each State") + coord_flip() + theme(legend.title = element_text(colour='#331D0C')) + labs(fill="Count of Breweries", y = "Count of Breweries") + scale_y_continuous(limits = c(0,max(totals$n) *1.1))
q1
Beers <- read.csv("Beers.csv", header = TRUE , ",")
BeerN <- rename(Beers,"Brew_ID" = "Brewery_id")
tdf <- merge(BeerN,Breweries, by = "Brew_ID", na.rm = True) %>% rename(Beer = Name.x, Brewery = Name.y)
#View(tdf)
head(tdf,n=6)
## Brew_ID Beer Beer_ID ABV IBU Style
## 1 1 Get Together 2692 0.045 50 American IPA
## 2 1 Maggie's Leap 2691 0.049 26 Milk / Sweet Stout
## 3 1 Wall's End 2690 0.048 19 English Brown Ale
## 4 1 Pumpion 2689 0.060 38 Pumpkin Ale
## 5 1 Stronghold 2688 0.060 25 American Porter
## 6 1 Parapet ESB 2687 0.056 47 Extra Special / Strong Bitter (ESB)
## Ounces Brewery City State
## 1 16 NorthGate Brewing Minneapolis MN
## 2 16 NorthGate Brewing Minneapolis MN
## 3 16 NorthGate Brewing Minneapolis MN
## 4 16 NorthGate Brewing Minneapolis MN
## 5 16 NorthGate Brewing Minneapolis MN
## 6 16 NorthGate Brewing Minneapolis MN
tail (tdf,n=6)
## Brew_ID Beer Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
###EXTRA
top_n(tdf,6,Brew_ID)
## Brew_ID Beer Beer_ID ABV IBU Style
## 1 556 Pilsner Ukiah 98 0.055 NA German Pilsener
## 2 557 Heinnieweisse Weissebier 52 0.049 NA Hefeweizen
## 3 557 Snapperhead IPA 51 0.068 NA American IPA
## 4 557 Moo Thunder Stout 50 0.049 NA Milk / Sweet Stout
## 5 557 Porkslap Pale Ale 49 0.043 NA American Pale Ale (APA)
## 6 558 Urban Wilderness Pale Ale 30 0.049 NA English Pale Ale
## Ounces Brewery City State
## 1 12 Ukiah Brewing Company Ukiah CA
## 2 12 Butternuts Beer and Ale Garrattsville NY
## 3 12 Butternuts Beer and Ale Garrattsville NY
## 4 12 Butternuts Beer and Ale Garrattsville NY
## 5 12 Butternuts Beer and Ale Garrattsville NY
## 6 12 Sleeping Lady Brewing Company Anchorage AK
colSums(is.na(tdf))
## Brew_ID Beer Beer_ID ABV IBU Style Ounces Brewery City State
## 0 0 0 62 1005 0 0 0 0 0
#Setup DF where we'll strip out missing data
dtdf_RemoveNA_ABV <- tdf
dtdf_RemoveNA_IBU <- tdf
dtdf <- tdf
### assigning to new data frame
dtdf_RemoveNA_ABV <- dtdf_RemoveNA_ABV[!is.na(tdf$ABV), ]
dtdf_RemoveNA_IBU <- dtdf_RemoveNA_IBU [!is.na(tdf$IBU), ]
dtdf <- dtdf[!is.na(dtdf$ABV), ]
dtdf <- dtdf[!is.na(dtdf$IBU), ]
colSums(is.na(dtdf_RemoveNA_ABV))
## Brew_ID Beer Beer_ID ABV IBU Style Ounces Brewery City State
## 0 0 0 0 943 0 0 0 0 0
colSums(is.na(dtdf_RemoveNA_IBU))
## Brew_ID Beer Beer_ID ABV IBU Style Ounces Brewery City State
## 0 0 0 0 0 0 0 0 0 0
colSums(is.na(dtdf))
## Brew_ID Beer Beer_ID ABV IBU Style Ounces Brewery City State
## 0 0 0 0 0 0 0 0 0 0
df1 <- dtdf_RemoveNA_ABV %>% group_by(State) %>% summarise(Median_ABV = median(ABV))
ggplot(data = df1, aes(x= State, y = Median_ABV * 100 ,fill = Median_ABV))+geom_bar(stat='identity', width=0.7) + ggtitle ("State v.Median_ABV") + scale_fill_gradient2(midpoint=mean(df1$Median_ABV), low= '#971B1E', mid = '#E02124', high = '#FFF200', space='#E1E1E1') + coord_flip() + geom_text(data = df1, aes(x = State, label = paste(State, paste(Median_ABV * 100, '%', sep = ''), sep=' - ')), position = position_dodge(width = 1), hjust = -0.1, size = 4) + scale_y_continuous(limits = c(0,max(df1$Median_ABV * 100) *1.1))
## question how can I do both at the same time
df2 <- dtdf_RemoveNA_IBU %>% group_by(State) %>% summarise(Median_IBU = median(IBU))
ggplot(data = df2, aes(x= State, y = Median_IBU ,fill = Median_IBU))+geom_bar(stat='identity', width=0.7)+ ggtitle ("State v.Median_IBU") + scale_fill_gradient2(midpoint=mean(df2$Median_IBU), low= '#971B1E', mid = '#E02124', high = '#FFF200', space='#E1E1E1') + coord_flip() + geom_text(data = df2, aes(x = State, label = paste(State, Median_IBU, sep=' - ')), position = position_dodge(width = 1), hjust = -0.1, size = 4) + scale_y_continuous(limits = c(0,max(df2$Median_IBU) *1.1))
#row number with of max value can be obtained by which.max, to get all information using by subsetting data frame.
df1[which.max(df1$Median_ABV),] ###Kentucky and DC
## # A tibble: 1 x 2
## State Median_ABV
## <chr> <dbl>
## 1 " DC" 0.0625
dtdf_RemoveNA_ABV[which.max(dtdf_RemoveNA_ABV$ABV),]
## Brew_ID Beer Beer_ID ABV
## 375 52 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale 2565 0.128
## IBU Style Ounces Brewery City State
## 375 NA Quadrupel (Quad) 19.2 Upslope Brewing Company Boulder CO
#IF we need to show the single highest ABV and IBU beers
#tail(arrange(dtdf, ABV))
df2[which.max(df2$Median_IBU),] ###Maine
## # A tibble: 1 x 2
## State Median_IBU
## <chr> <dbl>
## 1 " ME" 61
dtdf_RemoveNA_IBU[which.max(dtdf_RemoveNA_IBU$IBU),]
## Brew_ID Beer Beer_ID ABV IBU
## 1857 375 Bitter Bitch Imperial IPA 980 0.082 138
## Style Ounces Brewery City
## 1857 American Double / Imperial IPA 12 Astoria Brewing Company Astoria
## State
## 1857 OR
summary(dtdf$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02700 0.05000 0.05700 0.05991 0.06800 0.12500
sd(dtdf_RemoveNA_ABV$ABV*100) ###0.005437061
## [1] 1.354173
mean(dtdf_RemoveNA_ABV$ABV*100) ### 0.05564
## [1] 5.977342
ggplot(dtdf_RemoveNA_ABV, aes(x = ABV*100)) + geom_histogram(aes(y = ..count..), fill = '#971B1E') + scale_y_continuous(name = "Frequency") + theme(legend.position = 'none') + scale_x_continuous(name = 'Median ABV (%)', breaks = seq(2,13,.5)) + geom_vline(aes(xintercept = mean(dtdf_RemoveNA_ABV$ABV *100)),col='#FFF200',size=1) + geom_vline(aes(xintercept = median(dtdf_RemoveNA_ABV$ABV *100)),col='#E02124',size=1)
dtdf %>% ggplot(aes(x=ABV,y= IBU)) + geom_point() + geom_smooth (formula = y~ x,method="lm",col='#E02124',size=1) + ggtitle(("Relationship between ABV and IBU"))
outliers <- dtdf[dtdf$ABV > 0.075, ]
outliers <- outliers[outliers$IBU < 25, ]
outliers['Style']
## Style
## 178 Scotch Ale / Wee Heavy
## 199 Quadrupel (Quad)
## 225 Dubbel
## 491 Märzen / Oktoberfest
## 590 Fruit / Vegetable Beer
## 1266 American Strong Ale
## 1491 Scottish Ale
## 1645 Witbier
## 1762 Tripel
## 2188 Scottish Ale
library(dplyr)
library(class)
library(caret)
library(stringr)
tdf <- merge(BeerN,Breweries, by = "Brew_ID", na.rm = True) %>% rename(Beer = Name.x, Brewery = Name.y)
TD <- tdf %>% filter(grepl('IPA|Ale',Style,ignore.case = T))
TD = TD[!is.na(TD$ABV),]
TD= TD[!is.na(TD$IBU), ]
#dim(TD)
#get just the 3 relevant columns and get rid of the rest
TDSubset <- select(TD, ABV, IBU, Style)
#make style a character from factor
TDSubset$Style <- as.character(TDSubset$Style)
#make all IPA's have Style of IPA and all ales have simply ale - eliminates all of the additional types to act on them as all the same In response to Novin'ssuggestion to Ranjan, changing to 0 = ipa and 1 = ale
for (i in 1:nrow(TDSubset)) {
if (is.na(str_match(TDSubset[i, 3], 'Ale'))) {
TDSubset[i,3] <- 0
}
else {
TDSubset[i,3] <- 1
}
}
head(TDSubset)
## ABV IBU Style
## 1 0.045 50 0
## 2 0.048 19 1
## 3 0.060 38 1
## 4 0.080 68 0
## 5 0.042 42 1
## 6 0.066 21 1
#take 70% of the TDSubset to use as train and the remainder as test
ind = sample((1:nrow(TDSubset)),.7*nrow(TDSubset))
train = TDSubset[ind,]
test = TDSubset[-ind,]
pred = c("ABV","IBU")
#run 50 iterations to test the best value of K to test the best accuracy
bestKResults <- data.frame(Acc = numeric(100), k = numeric(100))
for (i in 1:50) {
class <- knn(train[,pred],test[,pred],train$Style , prob= TRUE, k= i)
conMatrix <- confusionMatrix(table(class, test$Style))
bestKResults$Acc[i] <- conMatrix$overall[1]
bestKResults$k[i] <- i
}
ggplot(bestKResults, aes(x=k, y = Acc)) + geom_line()+ labs(x = "K Value", y = "Accuracy")
classifications = knn(train[,pred],test[,pred],train$Style , prob= TRUE, k= 17)
confusionMatrix(table(classifications, test$Style))
## Confusion Matrix and Statistics
##
##
## classifications 0 1
## 0 83 15
## 1 39 153
##
## Accuracy : 0.8138
## 95% CI : (0.7641, 0.8569)
## No Information Rate : 0.5793
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6074
##
## Mcnemar's Test P-Value : 0.001749
##
## Sensitivity : 0.6803
## Specificity : 0.9107
## Pos Pred Value : 0.8469
## Neg Pred Value : 0.7969
## Prevalence : 0.4207
## Detection Rate : 0.2862
## Detection Prevalence : 0.3379
## Balanced Accuracy : 0.7955
##
## 'Positive' Class : 0
##
#create df for each lager, ipa, and apa, then remove all na IBU which will also remove any na abv as all na abv also have missing ibu
lagerDF <- tdf %>% filter(grepl('lager',Style,ignore.case = T))
ipaDF <- tdf %>% filter(grepl('IPA',Style,ignore.case = T))
paleAleDF <- tdf %>% filter(grepl('american pale ale',Style,ignore.case = T))
lagerDF <- lagerDF[!is.na(lagerDF$IBU), ]
ipaDF <- ipaDF[!is.na(ipaDF$IBU), ]
paleAleDF <- paleAleDF[!is.na(paleAleDF$IBU), ]
#Find the max, min, and median ABV and IBU for each of the top 3 beer styles
lagerDF[which.max(lagerDF$IBU),]
## Brew_ID Beer Beer_ID ABV IBU Style Ounces
## 2 3 Excess IPL 2671 0.072 80 American India Pale Lager 16
## Brewery City State
## 2 Jack's Abby Craft Lagers Framingham MA
lagerDF[which.min(lagerDF$IBU),]
## Brew_ID Beer Beer_ID ABV IBU Style Ounces
## 33 130 American Lager 2233 0.041 8 American Adjunct Lager 12
## Brewery City State
## 33 Straub Brewery St Mary's PA
lagerDF[which.max(lagerDF$ABV),]
## Brew_ID Beer Beer_ID ABV IBU Style Ounces
## 2 3 Excess IPL 2671 0.072 80 American India Pale Lager 16
## Brewery City State
## 2 Jack's Abby Craft Lagers Framingham MA
lagerDF[which.min(lagerDF$ABV),]
## Brew_ID Beer Beer_ID ABV IBU Style Ounces Brewery
## 35 130 American Light 2231 0.032 13 Light Lager 12 Straub Brewery
## City State
## 35 St Mary's PA
medianLagerIBU <- lagerDF %>% summarise(Median_IBU = median(IBU))
medianLagerABV <- lagerDF %>% summarise(Median_ABV = median(ABV))
medianLagerABV$Median_ABV
## [1] 0.049
medianLagerIBU$Median_IBU
## [1] 19
ipaDF[which.max(ipaDF$IBU),]
## Brew_ID Beer Beer_ID ABV IBU
## 462 375 Bitter Bitch Imperial IPA 980 0.082 138
## Style Ounces Brewery City State
## 462 American Double / Imperial IPA 12 Astoria Brewing Company Astoria OR
ipaDF[which.min(ipaDF$IBU),]
## Brew_ID Beer Beer_ID ABV IBU
## 277 200 Lights Out Vanilla Cream Extra Stout 1513 0.077 30
## Style Ounces Brewery City State
## 277 American Double / Imperial IPA 12 Worthy Brewing Company Bend OR
ipaDF[which.max(ipaDF$ABV),]
## Brew_ID Beer Beer_ID ABV IBU Style
## 40 25 Hopkick Dropkick 2471 0.099 115 American Double / Imperial IPA
## Ounces Brewery City State
## 40 12 Burn 'Em Brewing Michigan City IN
ipaDF[which.min(ipaDF$ABV),]
## Brew_ID Beer Beer_ID ABV IBU Style Ounces
## 66 35 Even Keel 2105 0.038 40 American IPA 12
## Brewery City State
## 66 Ballast Point Brewing Company San Diego CA
medianIPAIBU <- ipaDF %>% summarise(Median_IBU = median(IBU))
medianIPAABV <- ipaDF %>% summarise(Median_ABV = median(ABV))
medianIPAABV$Median_ABV
## [1] 0.068
medianIPAIBU$Median_IBU
## [1] 70
paleAleDF[which.max(paleAleDF$IBU),]
## Brew_ID Beer Beer_ID ABV IBU Style
## 69 136 Over the Rail Pale Ale 711 0.057 68 American Pale Ale (APA)
## Ounces Brewery City State
## 69 12 Pug Ryan's Brewery Dillon CO
paleAleDF[which.min(paleAleDF$IBU),]
## Brew_ID Beer Beer_ID ABV IBU Style
## 165 330 Bronx Summer Pale Ale 1748 0.052 16 American Pale Ale (APA)
## Ounces Brewery City State
## 165 16 The Bronx Brewery Bronx NY
paleAleDF[which.max(paleAleDF$ABV),]
## Brew_ID Beer Beer_ID ABV IBU Style Ounces
## 101 169 The Power of Zeus 2294 0.07 68 American Pale Ale (APA) 12
## Brewery City State
## 101 High Hops Brewery Windsor CO
paleAleDF[which.min(paleAleDF$ABV),]
## Brew_ID Beer Beer_ID ABV IBU
## 210 449 Back in the Saddle Rye Pale Ale 1047 0.037 53
## Style Ounces Brewery City State
## 210 American Pale Ale (APA) 12 Mavericks Beer Company Half Moon Bay CA
medianAPAIBU <- paleAleDF %>% summarise(Median_IBU = median(IBU))
medianAPAABV <- paleAleDF %>% summarise(Median_ABV = median(ABV))
medianAPAABV$Median_ABV
## [1] 0.055
medianAPAIBU$Median_IBU
## [1] 44
regions <- read.csv("states_by_region.csv", header = TRUE , ",")
#regions has State.Code
#ipaDF has State
#need to mutate to join State.Code to ipaDF
regions <- regions %>% rename(StateName = State, State = State.Code)
regions$State <- trimws(as.character(regions$State), which = c("both"), whitespace = "[ \t\r\n]")
ipaDF$State <- trimws(as.character(ipaDF$State), which = c("both"), whitespace = "[ \t\r\n]")
#head(regions)
#head(ipaDF)
ipaDF <- merge(x=ipaDF,y=regions, by = "State", all.x=TRUE)
head(ipaDF)
## State Brew_ID Beer Beer_ID ABV IBU Style
## 1 AK 103 King Street IPA 1667 0.060 70 American IPA
## 2 AK 224 Pleasure Town 2093 0.063 61 American IPA
## 3 AK 224 Pleasure Town IPA 1814 0.063 61 American IPA
## 4 AK 494 Fairweather IPA 648 0.061 64 American IPA
## 5 AK 454 Twister Creek India Pale Ale 947 0.065 71 American IPA
## 6 AK 224 Sockeye Red IPA 434 0.057 70 American IPA
## Ounces Brewery City StateName Region Division
## 1 12 King Street Brewing Company Anchorage Alaska West Pacific
## 2 12 Midnight Sun Brewing Company Anchorage Alaska West Pacific
## 3 12 Midnight Sun Brewing Company Anchorage Alaska West Pacific
## 4 12 Broken Tooth Brewing Company Anchorage Alaska West Pacific
## 5 12 Denali Brewing Company Talkeetna Alaska West Pacific
## 6 12 Midnight Sun Brewing Company Anchorage Alaska West Pacific
#sumnmarise a count of IPAs, median abv and ibu by region
ipaCountByDivision <- ipaDF %>% group_by(Division) %>% summarise(CountByDivision = n())
ipaMedianABVByDivision <- ipaDF %>% group_by(Division) %>% summarise(Median_ABV = median(ABV))
ipaMedianIBUByDivision <- ipaDF %>% group_by(Division) %>% summarise(Median_IBU = median(IBU))
#Plot the counts by division
ggplot(data = ipaCountByDivision, aes(x= Division, y = CountByDivision ,fill = CountByDivision))+geom_bar(stat='identity', width=0.7)+ ggtitle ("Number of IPAs in each Census Division") + scale_fill_gradient2(midpoint=mean(ipaCountByDivision$CountByDivision), low= '#971B1E', mid = '#E02124', high = '#FFF200', space='#E1E1E1') + coord_flip() + geom_text(data = ipaCountByDivision, aes(x = Division, label = CountByDivision), position = position_dodge(width = 1), hjust = -0.1, size = 4) + scale_y_continuous(limits = c(0,max(ipaCountByDivision$CountByDivision) *1.1))
#Plot the median ABV by division
ggplot(data = ipaMedianABVByDivision, aes(x= Division, y = Median_ABV ,fill = Median_ABV))+geom_bar(stat='identity', width=0.7)+ ggtitle ("Median ABV of IPAs in each Census Division") + scale_fill_gradient2(midpoint=mean(ipaMedianABVByDivision$Median_ABV), low= '#971B1E', mid = '#E02124', high = '#FFF200', space='#E1E1E1') + coord_flip() + geom_text(data = ipaMedianABVByDivision, aes(x = Division, label = paste(Median_ABV*100, " %", "" )), position = position_dodge(width = 1), hjust = -0.1, size = 4) + scale_y_continuous(limits = c(0,max(ipaMedianABVByDivision$Median_ABV) *1.1))
#Plot the median IBU by Division
ggplot(data = ipaMedianIBUByDivision, aes(x= Division, y = Median_IBU ,fill = Median_IBU))+geom_bar(stat='identity', width=0.7)+ ggtitle ("Median IBU of IPAs in each Census Division") + scale_fill_gradient2(midpoint=mean(ipaMedianIBUByDivision$Median_IBU), low= '#971B1E', mid = '#E02124', high = '#FFF200', space='#E1E1E1') + coord_flip() + geom_text(data = ipaMedianIBUByDivision, aes(x = Division, label = Median_IBU), position = position_dodge(width = 1), hjust = -0.1, size = 4) + scale_y_continuous(limits = c(0,max(ipaMedianIBUByDivision$Median_IBU) *1.1))
| Measure | Value |
|---|---|
| Max ABV | 7.2% |
| Min ABV | 3.2% |
| Max IBU | 80 |
| Min IBU | 8 |
| Median ABV | 4.9% |
| Median IBU | 19 |
| Measure | Value |
|---|---|
| Max ABV | 9.9% |
| Min ABV | 3.8% |
| Max IBU | 138 |
| Min IBU | 30 |
| Median ABV | 6.8% |
| Median IBU | 70 |
| Measure | Value |
|---|---|
| Max ABV | 7.0% |
| Min ABV | 3.7% |
| Max IBU | 68 |
| Min IBU | 16 |
| Median ABV | 5.5% |
| Median IBU | 44 |