Analysis of Warren Buffett's letters to investors
Ram Annepu
7 July 2018
knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE
)
# Install any missing packages, then attach them all.
pkgs <- c("dplyr", "tidytext", "tidyr", "tibble", "XML", "wordcloud", "rvest",
          "pdftools", "stringr", "splitstackshape", "sentimentr", "tm",
          "plotly", "treemapify", "treemap", # "viridis",
          "reshape2", "widyr", "ggraph", "igraph", "udpipe", "ggplot2")
for (p in pkgs) {
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
    library(p, character.only = TRUE)
  }
}
Thanks to Michael Toth, whose work on scraping the letters from HTML pages and PDF documents expedited mine.
urls_77_97 <- paste('http://www.berkshirehathaway.com/letters/', seq(1977, 1997), '.html', sep = '')
html_urls <- c(urls_77_97,
               'http://www.berkshirehathaway.com/letters/1998htm.html',
               'http://www.berkshirehathaway.com/letters/1999htm.html',
               'http://www.berkshirehathaway.com/2000ar/2000letter.html',
               'http://www.berkshirehathaway.com/2001ar/2001letter.html')
letters_html <- lapply(html_urls, function(x) read_html(x) %>% html_text())
urls_03_18 <- paste('http://www.berkshirehathaway.com/letters/', seq(2003, 2018), 'ltr.pdf', sep = '')
pdf_urls <- data.frame('year' = seq(2002, 2018),
                       'link' = c('http://www.berkshirehathaway.com/letters/2002pdf.pdf', urls_03_18))
# Download one PDF letter to a local file named <year>.pdf and return the file name.
download_pdfs <- function(x) {
  file_name = paste0(x['year'], '.pdf')
  download.file(url = x['link'], destfile = file_name, mode = 'wb')
  return(file_name)
}
pdfs <- apply(pdf_urls, 1, download_pdfs)
letters_pdf <- lapply(pdfs, function(x) pdf_text(x) %>% paste(collapse=" "))
# Delete the downloaded PDFs once their text has been extracted.
invisible(lapply(pdfs, function(x) if (file.exists(x)) file.remove(x)))
letters <- do.call(rbind, Map(data.frame, year = seq(1977, 2018), text = c(letters_html, letters_pdf)))
letters$text <- as.character(letters$text)
# gsub is vectorised, so the text column stays a character vector:
# collapse line breaks and tabs to spaces, then drop everything except
# alphanumerics, whitespace, and basic punctuation.
letters$text <- gsub("\r?\n|\r|\t", " ", letters$text)
letters$text <- gsub("[^[:alnum:][:space:].,-]", "", letters$text)
#letters$text<-lapply(letters$text, function(x) as.character(gsub("(\\s){2,}", " ", x)))
#letters$text<-lapply(letters$text, function(x) as.character(gsub("(.\\s){2,}", "", str_trim(x))))
#letters$text<-lapply(letters$text, function(x) as.character(gsub("(.){2,}", "", str_trim(x))))
#letters$text<-lapply(letters$text, function(x) as.character(gsub("-+", "-", str_trim(x))))
#letters<-cSplit(letters, 'text', '.', 'long')
letters_lines<-letters%>%
group_by(year) %>%
mutate(linenumber = row_number()) %>%
ungroup
letters_lines$text <- as.character(letters_lines$text)
sentimentr is used here because it can capture negations in sentences. Whereas token-based lexicons such as Bing and AFINN score words in isolation and drop negators, sentimentr analyses the corpus sentence by sentence, so it gives a more accurate sentiment score by accounting for the negations within each sentence.
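As a minimal illustration (the example sentences are made up, not taken from the letters): a token lexicon sees the positive word "good" in both sentences below, while sentimentr's valence shifters flip the sign of the negated one.
# Demo: sentimentr detects the negator "not" and scores the second sentence negative.
sentimentr::sentiment(c("The results were good.", "The results were not good."))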
letters %>%
  sentimentr::get_sentences() %>%
  sentimentr::sentiment() %>%
  mutate(characters = nchar(stripWhitespace(text))) %>%
  filter(characters > 1) -> senti_sentences
senti_sentences$sentiment <- as.numeric(senti_sentences$sentiment)
# Tag each sentence by the sign of its sentiment score.
senti_sentences$pntag <- ifelse(senti_sentences$sentiment == 0, 'Neutral',
                         ifelse(senti_sentences$sentiment > 0, 'Positive', 'Negative'))
senti_sentences %>%
  group_by(year) %>%
  mutate(sentence_id = row_number()) %>%
  ungroup() -> senti_sentences
ax <- list(
title = "Line Number",
zeroline = FALSE,
showline = FALSE,
showticklabels = TRUE
)
pal <- c("red", "blue", "green")
subplot(nrows=2,shareX = TRUE, shareY = TRUE,
plot_ly(data = senti_sentences%>%filter(year %in% "1984"),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax),
plot_ly(data = senti_sentences%>%filter(year %in% "1985"),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax, legend=FALSE ),
plot_ly(data = senti_sentences%>%filter(year %in% "1986"),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "1987"),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "1988"),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "1989"),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "1990"),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "1991"),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE)
)
pal <- c("red", "blue", "green")
subplot(nrows=2,shareX = TRUE, shareY = TRUE,
plot_ly(data = senti_sentences%>%filter(year %in% "2011")%>%filter(pntag %in% c("Positive","Negative")),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE,title="letters of years 2011 to 2018, from left to right"),
plot_ly(data = senti_sentences%>%filter(year %in% "2012")%>%filter(pntag %in% c("Positive","Negative")),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "2013")%>%filter(pntag %in% c("Positive","Negative")),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "2014")%>%filter(pntag %in% c("Positive","Negative")),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "2015")%>%filter(pntag %in% c("Positive","Negative")),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "2016")%>%filter(pntag %in% c("Positive","Negative")),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "2017")%>%filter(pntag %in% c("Positive","Negative")),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE),
plot_ly(data = senti_sentences%>%filter(year %in% "2018")%>%filter(pntag %in% c("Positive","Negative")),showlegend=F, x = ~sentence_id, y = ~sentiment, color = ~pntag,text = ~text,
type = 'scatter', mode = 'markers',colors = pal,opacity = 0.8) %>% layout(xaxis = ax,legend=FALSE)
)
#------------ Emotion analysis-----------------------------------------------------------------------------
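The NRC lexicon used below maps each word to one or more of eight basic emotions (anger, anticipation, disgust, fear, joy, sadness, surprise, trust) as well as to positive and negative sentiment, so a single word can contribute to several categories at once.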
emotion.nrc <- letters %>%
unnest_tokens(word, text) %>% # split text into words
anti_join(stop_words, by = "word") %>% # remove stop words
filter(!grepl('[0-9]', word)) %>% # remove numbers
left_join(get_sentiments("nrc"), by = "word") # add sentiment scores to words
emotion.nrc %>%
  filter(!is.na(sentiment)) -> filtered_emo
filtered_emo %>%
  group_by(sentiment) %>%
  summarise(n = n()) %>%
  mutate(percent = (n / sum(n)) * 100) %>%
  ggplot(aes(area = percent, fill = sentiment, label = paste(round(percent, 1), "%:", sentiment))) +
  treemapify::geom_treemap() +
  treemapify::geom_treemap_text(fontface = "italic", colour = "white", place = "centre", grow = TRUE) +
  scale_fill_brewer(palette = "Set3")
filtered_emo %>%
  group_by(year, sentiment) %>%
  summarise(n = n()) %>%
  mutate(percent = (n / sum(n)) * 100) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = percent, fill = sentiment, label = round(percent, 0))) +
  geom_bar(position = "stack", stat = "identity") +
  geom_text(size = 3, position = position_stack(vjust = 0.5)) +
  scale_fill_brewer(palette = "Set3")
Here are word-cloud plots to analyse how his writing has changed over the years.
senti.bing <- letters %>%
unnest_tokens(word, text) %>% # split text into words
anti_join(stop_words, by = "word") %>% # remove stop words
filter(!grepl('[0-9]', word)) %>% # remove numbers
inner_join(get_sentiments("bing"), by = "word")
senti.bing %>%
  filter(year %in% (1977:1991)) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>% # convert tibble into matrix
  comparison.cloud(colors = c("orange", "blue"), max.words = 300)
title("Years 1977 to 1991")
senti.bing %>%
  filter(year %in% (2004:2018)) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>% # convert tibble into matrix
  comparison.cloud(colors = c("orange", "blue"), max.words = 300)
title("Years 2004 to 2018")
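Next, tf-idf is used to find the sentiment words most distinctive to each letter. Treating each year's letter as a document, bind_tf_idf() weights a word's frequency within a letter by ln(number of letters / number of letters containing the word), so words that appear in every letter score near zero while words concentrated in one year rise to the top.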
top.words_tfidf <- senti.bing %>%
  group_by(word, sentiment, year) %>%
  count(word) %>%
  bind_tf_idf(word, year, n) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  arrange(desc(tf_idf)) %>%
  ungroup()
top.words_tfidf %>%
filter(tf_idf > 0.013 ) %>%
mutate(tf_idf = ifelse(sentiment == "negative", -tf_idf, tf_idf)) %>%
mutate(word = reorder(word, tf_idf)) %>%
plot_ly( x = ~word, y = ~tf_idf, type = 'bar',color = ~sentiment,hovertext=~year) %>%
layout(xaxis = list(title = "word", tickangle = -45),
yaxis = list(title = "Tf_idf"))
top_words_year <- top.words_tfidf %>%
  group_by(year, sentiment) %>%
  top_n(2, tf_idf) %>%
  arrange(desc(year), sentiment)
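Moving from single words to phrases, trigrams (sequences of three consecutive words) are extracted next, with stop words filtered from every position; the counts below surface recurring phrases such as "net tangible assets" and "pre tax earnings".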
letters$text<-as.character(letters$text)
trigrams_letters <- letters %>%
unnest_tokens(trigram, text,
token = "ngrams", n = 3) %>%
filter(!grepl('[0-9]', trigram))
trigrams_separated <- trigrams_letters %>%
separate(trigram, c("word1", "word2","word3"), sep = " ")
trigrams_filtered <- trigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word3 %in% stop_words$word)
trigram_counts <- trigrams_filtered %>%
count(word1, word2,word3, sort = TRUE)
trigrams_united <- trigrams_filtered %>%
unite(trigram, word1, word2,word3, sep = " ") %>%
group_by(year, trigram) %>%
count(year,trigram, sort = TRUE)
head(trigrams_united,100)
## # A tibble: 100 x 3
## # Groups: year, trigram [100]
## year trigram n
## <int> <chr> <int>
## 1 1983 net tangible assets 12
## 2 2011 pre tax earnings 11
## 3 2007 pre tax earnings 10
## 4 1977 blue chip stamps 9
## 5 1996 super cat business 8
## 6 2014 berkshire hathaway energy 7
## 7 1979 blue chip stamps 6
## 8 1980 reported operating earnings 6
## 9 1983 nebraska furniture mart 6
## 10 2010 pre tax earnings 6
## # ... with 90 more rows
trigram_graph <- trigram_counts %>%
filter(n > 5) %>%
graph_from_data_frame()
ggraph(trigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
letter_words <- letters %>%
unnest_tokens(word, text) %>%
filter(!grepl('[0-9]', word))%>%
filter(!word %in% stop_words$word)
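widyr::pairwise_count() tallies how often two words appear in the same year's letter, while pairwise_cor() computes the phi coefficient between words across years, so highly correlated pairs tend to appear together in the same letters and be absent from the same letters.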
word_pairs <- letter_words %>%
pairwise_count(word, year, sort = TRUE)
word_cors <- letter_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, year, sort = TRUE)
word_cors %>%
filter(correlation > .9) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
#-------------------------- Extract phrases -----------------------------------
# Extracting collocated nouns and adjectives
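udpipe downloads a pre-trained English model and annotates the corpus with tokens, lemmas, and universal part-of-speech (upos) tags; everything in this section builds on that annotation.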
en <- udpipe_download_model(language = "english")
udmodel_english <- udpipe_load_model(file = en$file_model)
anno_words <- udpipe_annotate(udmodel_english, x = letters$text)
anno_words <- as.data.frame(anno_words)
#anno_words %>%keywords_collocation(term = "token", group = c("doc_id", "sentence_id"),ngram_max = 5)->phrase_words
## Co-occurrence: how frequently do nouns and adjectives occur in the same sentence?
stats <- cooccurrence(x = subset(anno_words, upos %in% c("NOUN", "ADJ")),
                      term = "lemma", group = c("doc_id", "sentence_id"))
## Alternative: how frequently do words follow one another, even when skipping
## up to 2 words in between?
# stats <- cooccurrence(x = anno_words$lemma,
#                       relevant = anno_words$upos %in% c("NOUN", "ADJ"), skipgram = 2)
wordnetwork <- head(stats, 300)
wordnetwork <- graph_from_data_frame(wordnetwork)
ggraph(wordnetwork, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme(legend.position = "none") +
  labs(title = "Co-occurrences within the same sentence", subtitle = "Nouns & adjectives")
stats<-as.data.frame(stats)
head(stats,200)
## term1 term2 cooc
## 1 last year 692
## 2 book value 241
## 3 intrinsic value 237
## 4 net worth 217
## 5 annual meeting 167
## 6 market value 139
## 7 business value 137
## 8 balance sheet 134
## 9 annual report 133
## 10 insurance business 124
## 11 common stock 120
## 12 insurance operation 119
## 13 few year 115
## 14 underwriting profit 106
## 15 insurance company 105
## 16 capital gain 104
## 17 recent year 102
## 18 operating earning 87
## 19 many year 86
## 20 income tax 84
## 21 underwriting loss 81
## 22 intrinsic business 80
## 23 investment income 78
## 24 stock market 66
## 25 market price 61
## 26 financial statement 60
## 27 -share book 57
## 28 premium volume 55
## 29 stock price 54
## 30 interest rate 52
## 31 insurance industry 51
## 32 large amount 51
## 33 tangible asset 51
## 34 earn power 50
## 35 financial strength 50
## 36 tax rate 49
## 37 real estate 48
## 38 other word 47
## 39 american business 47
## 40 present management 45
## 41 pre-tax earning 45
## 42 accounting adjustment 43
## 43 equity capital 42
## 44 investment banker 42
## 45 other business 41
## 46 underwriting result 41
## 47 most case 41
## 48 contribution program 41
## 49 marketable security 40
## 50 net investment 40
## 51 minority interest 40
## 52 capital allocation 40
## 53 interest expense 40
## 54 auto insurance 39
## 55 year period 38
## 56 economic characteristic 38
## 57 new business 38
## 58 meeting credential 38
## 59 net earning 37
## 60 purchase price 37
## 61 market share 37
## 62 b share 37
## 63 percentage point 35
## 64 competitive advantage 35
## 65 good business 34
## 66 other major 34
## 67 loss reserve 34
## 68 stock investment 33
## 69 long time 33
## 70 earning statement 33
## 71 large sum 32
## 72 great many 32
## 73 large part 32
## 74 net tangible 32
## 75 per -share 32
## 76 derivative contract 32
## 77 many other 31
## 78 proxy material 31
## 79 current asset 31
## 80 index fund 31
## 81 total current 30
## 82 current liability 30
## 83 cat business 30
## 84 total Earning 29
## 85 good news 29
## 86 tax earning 29
## 87 -share intrinsic 29
## 88 textile business 28
## 89 unrealized gain 28
## 90 operating manager 28
## 91 accounting rule 28
## 92 good return 28
## 93 other current 28
## 94 public company 27
## 95 profit margin 27
## 96 net income 27
## 97 next year 27
## 98 tax return 27
## 99 facing page 27
## 100 most year 26
## 101 corporate performance 26
## 102 loss cost 26
## 103 other insurer 26
## 104 vice Chairman 26
## 105 electric utility 26
## 106 more money 25
## 107 annual earning 25
## 108 undistributed earning 25
## 109 low price 25
## 110 parent company 25
## 111 true economic 25
## 112 term debt 25
## 113 depreciation charge 25
## 114 acre site 25
## 115 other hand 24
## 116 investment manager 24
## 117 amortization charge 24
## 118 many people 24
## 119 several year 23
## 120 year report 23
## 121 tax cost 23
## 122 low cost 23
## 123 major investee 23
## 124 equity investment 22
## 125 pension fund 22
## 126 -share earning 22
## 127 major way 22
## 128 actual owner 22
## 129 nominee name 22
## 130 many decade 22
## 131 capital expenditure 22
## 132 key figure 22
## 133 shareholder discount 22
## 134 special pricing 22
## 135 long term 21
## 136 worker compensation 21
## 137 casualty insurance 21
## 138 reinsurance business 21
## 139 few case 21
## 140 economic value 21
## 141 negative return 21
## 142 other company 21
## 143 new shareholder 21
## 144 stock option 21
## 145 specific business 21
## 146 huge amount 21
## 147 other respects 21
## 148 report apply 21
## 149 sees candies 20
## 150 entire business 20
## 151 insurance manager 20
## 152 past decade 20
## 153 positive return 20
## 154 past report 20
## 155 past year 20
## 156 eligible share 20
## 157 future program 20
## 158 federal income 20
## 159 percentage change 20
## 160 wide variety 20
## 161 many company 20
## 162 cash equivalent 20
## 163 Financial product 20
## 164 other event 20
## 165 top counselor 20
## 166 annual percentage 20
## 167 calendar year 20
## 168 utility operation 20
## 169 other intangible 20
## 170 cost market 19
## 171 economic reality 19
## 172 tax basis 19
## 173 good reason 19
## 174 term bond 19
## 175 last few 19
## 176 next few 19
## 177 corporate tax 19
## 178 next decade 19
## 179 other time 19
## 180 bad news 19
## 181 price accounting 19
## 182 other asset 19
## 183 insurance world 19
## 184 auto insurer 19
## 185 main gala 19
## 186 equity securitie 19
## 187 corporation such 19
## 188 appropriate tax 19
## 189 aggregate lag 19
## 190 business model 19
## 191 hedge fund 19
## 192 insurance subsidiary 18
## 193 excellent business 18
## 194 small fraction 18
## 195 small portion 18
## 196 square foot 18
## 197 economic interest 18
## 198 future year 18
## 199 american corporation 18
## 200 business school 18
#Simple noun phrases (an adjective+noun, a pre/postposition, an optional determiner, and another adjective+noun)
anno_words$phrase_tag <- as_phrasemachine(anno_words$upos, type = "upos")
stats <- keywords_phrases(x = anno_words$phrase_tag, term = anno_words$token,
pattern = "(A|N)+N(P+D*(A|N)*N)*",
is_regex = TRUE, ngram_max = 8, detailed = FALSE)
head(stats,100)
## keyword ngram freq
## 1 last year 2 365
## 2 Last year 2 248
## 3 book value 2 241
## 4 intrinsic value 2 236
## 5 which we 2 188
## 6 that we 2 170
## 7 our insurance 2 157
## 8 net worth 2 152
## 9 market value 2 129
## 10 business value 2 128
## 11 Berkshire Hathaway 2 121
## 12 annual meeting 2 120
## 13 insurance business 2 116
## 14 few years 2 115
## 15 balance sheet 2 114
## 16 Scott Fetzer 2 103
## 17 underwriting profit 2 102
## 18 recent years 2 102
## 19 our shareholders 2 98
## 20 Furniture Mart 2 98
## 21 capital gains 2 96
## 22 Nebraska Furniture 2 94
## 23 what we 2 92
## 24 operating earnings 2 90
## 25 Nebraska Furniture Mart 3 89
## 26 insurance companies 2 87
## 27 American Express 2 87
## 28 year we 2 86
## 29 our float 2 83
## 30 Blue Chip 2 82
## 31 many years 2 82
## 32 annual report 2 80
## 33 intrinsic business 2 80
## 34 Wells Fargo 2 80
## 35 common stock 2 79
## 36 Washington Post 2 77
## 37 SP 500 2 77
## 38 intrinsic business value 3 76
## 39 last years 2 75
## 40 General Re 2 74
## 41 that I 2 71
## 42 Berkshire shares 2 68
## 43 those who 2 66
## 44 10 a.m. 2 66
## 45 stock market 2 65
## 46 business that 2 65
## 47 Berkshire shareholders 2 64
## 48 businesses that 2 64
## 49 our businesses 2 64
## 50 Wall Street 2 63
## 51 Mrs. B 2 63
## 52 insurance operation 2 62
## 53 our purchase 2 62
## 54 New York 2 61
## 55 companies that 2 60
## 56 which they 2 59
## 57 ten years 2 59
## 58 our operating 2 57
## 59 National Indemnity 2 57
## 60 financial statements 2 57
## 61 underwriting loss 2 57
## 62 -share book 2 57
## 63 6 p.m. 2 57
## 64 insurance operations 2 56
## 65 -share book value 3 56
## 66 premium volume 2 53
## 67 earning power 2 52
## 68 World Book 2 52
## 69 insurance industry 2 51
## 70 what they 2 51
## 71 five years 2 51
## 72 tangible assets 2 51
## 73 financial strength 2 50
## 74 our managers 2 50
## 75 Washington Post Company 3 49
## 76 Post Company 2 49
## 77 our shareholder 2 49
## 78 pre-tax earnings 2 49
## 79 real estate 2 48
## 80 Buffalo News 2 48
## 81 managers who 2 47
## 82 other words 2 47
## 83 year I 2 47
## 84 Berkshire Hathaway Inc 3 46
## 85 Hathaway Inc 2 46
## 86 investment income 2 46
## 87 our holdings 2 46
## 88 years that 2 46
## 89 Last year we 3 46
## 90 Warren E. 2 45
## 91 Warren E. Buffett 3 45
## 92 E. Buffett 2 45
## 93 HATHAWAY INC 2 43
## 94 our cost 2 43
## 95 our ownership 2 43
## 96 earnings that 2 43
## 97 present management 2 43
## 98 Charlie Munger 2 43
## 99 their businesses 2 43
## 100 equity capital 2 42
# Most occurring adjectives
library(lattice)
stats <- subset(anno_words, upos %in% c("ADJ"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 30), col = "cadetblue",
main = "Most occurring adjectives", xlab = "Freq")
# Most occurring adverbs
stats <- subset(anno_words, upos %in% c("ADV"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 30), col = "cadetblue",
main = "Most occurring adverbs", xlab = "Freq")
# Finding keywords
stats <- keywords_rake(x = anno_words, term = "lemma", group = "doc_id",
                       relevant = anno_words$upos %in% c("NOUN", "ADJ"))
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
# Keep keywords that occur more than 3 times before plotting the top 20.
barchart(key ~ rake, data = head(subset(stats, freq > 3), 20), col = "cadetblue",
         main = "Keywords identified by RAKE",
         xlab = "Rake")
anno_words$word <- tolower(anno_words$token)
stats <- keywords_collocation(x = anno_words, term = "word", group = "doc_id")
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
barchart(key ~ pmi, data = head(subset(stats, freq > 10), 20), col = "cadetblue",
main = "Keywords identified by PMI Collocation",
xlab = "PMI (Pointwise Mutual Information)")