# Global knitr chunk options: show code, hide messages and warnings.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
            # ---- Package setup ---------------------------------------------
# Load every required package, installing it first when it is missing.
#
# The original pattern
#   ifelse(require(pkg), {library("pkg")}, install.packages("pkg"))
# had two defects: ifelse() is a vectorized data function misused for
# scalar control flow, and a freshly installed package was never loaded
# (install.packages() installs but does not attach).
#
# Order is preserved from the original script because later packages
# (plotly, igraph, ...) deliberately mask functions from earlier ones.
pkgs <- c(
  "dplyr", "tidytext", "tidyr", "tibble", "XML", "wordcloud", "rvest",
  "pdftools", "stringr", "splitstackshape", "sentimentr", "tm", "plotly",
  "treemapify", "treemap", "reshape2", "widyr", "ggraph", "igraph",
  "udpipe", "ggplot2"
)
# "viridis" was considered in the original script but is not used.
for (pkg in pkgs) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

Thanks to Michael Toth for expediting my work by scraping the letters from the HTML pages and PDF documents.

# ---- Download Buffett's shareholder letters (1977-2018) -------------------
# Letters 1977-2001 are published as HTML pages; 2002-2018 as PDFs.

urls_77_97 <- paste0('http://www.berkshirehathaway.com/letters/', seq(1977, 1997), '.html')

html_urls <- c(urls_77_97,
               'http://www.berkshirehathaway.com/letters/1998htm.html',
               'http://www.berkshirehathaway.com/letters/1999htm.html',
               'http://www.berkshirehathaway.com/2000ar/2000letter.html',
               'http://www.berkshirehathaway.com/2001ar/2001letter.html')

# Scrape the plain text of every HTML letter.
letters_html <- lapply(html_urls, function(x) read_html(x) %>% html_text())

urls_03_18 <- paste0('http://www.berkshirehathaway.com/letters/', seq(2003, 2018), 'ltr.pdf')

pdf_urls <- data.frame(year = seq(2002, 2018),
                       link = c('http://www.berkshirehathaway.com/letters/2002pdf.pdf',
                                urls_03_18),
                       stringsAsFactors = FALSE)

# Download one PDF letter and return the local file name it was saved to.
download_pdfs <- function(year, link) {
  file_name <- paste0(year, '.pdf')
  download.file(url = link, destfile = file_name, mode = 'wb')
  file_name
}

# Map() iterates over the columns with their own types; the original
# apply(pdf_urls, 1, ...) first coerced the whole data frame to a
# character matrix, a classic type footgun.
pdfs <- unlist(Map(download_pdfs, pdf_urls$year, pdf_urls$link))

# Extract the text of each PDF, collapsing pages into one string.
letters_pdf <- lapply(pdfs, function(x) pdf_text(x) %>% paste(collapse = " "))

# Remove the downloaded files once the text has been extracted.
lapply(pdfs, function(x) if (file.exists(x)) file.remove(x))
## [[1]]
            ## [1] TRUE
            ## 
            ## [[2]]
            ## [1] TRUE
            ## 
            ## [[3]]
            ## [1] TRUE
            ## 
            ## [[4]]
            ## [1] TRUE
            ## 
            ## [[5]]
            ## [1] TRUE
            ## 
            ## [[6]]
            ## [1] TRUE
            ## 
            ## [[7]]
            ## [1] TRUE
            ## 
            ## [[8]]
            ## [1] TRUE
            ## 
            ## [[9]]
            ## [1] TRUE
            ## 
            ## [[10]]
            ## [1] TRUE
            ## 
            ## [[11]]
            ## [1] TRUE
            ## 
            ## [[12]]
            ## [1] TRUE
            ## 
            ## [[13]]
            ## [1] TRUE
            ## 
            ## [[14]]
            ## [1] TRUE
            ## 
            ## [[15]]
            ## [1] TRUE
            ## 
            ## [[16]]
            ## [1] TRUE
            ## 
            ## [[17]]
            ## [1] TRUE
# Combine all letters into one data frame: one row per year.
letters <- do.call(rbind, Map(data.frame,
                              year = seq(1977, 2018),
                              text = c(letters_html, letters_pdf),
                              stringsAsFactors = FALSE))

letters$text <- as.character(letters$text)
# gsub() is vectorized over character vectors, so no lapply() is needed.
# (The original lapply() calls also turned `text` into a list column,
# which forced repeated as.character() repairs downstream.)
# Collapse line breaks and tabs into spaces.
letters$text <- gsub("\r?\n|\r|\t", " ", letters$text)
# Drop everything except letters, digits, whitespace and . , -
letters$text <- gsub("[^[:alnum:][:space:].,-]", "", letters$text)

# Number the lines within each year's letter.
letters_lines <- letters %>%
  group_by(year) %>%
  mutate(linenumber = row_number()) %>%
  ungroup()

letters_lines$text <- as.character(letters_lines$text)

The sentimentr package is used here because it can capture negations in sentences. Whereas other NLP lexicons, such as Bing and AFINN, can only analyse individual tokens and therefore miss negation words, sentimentr analyses the corpus sentence by sentence. Hence, it gives a more accurate sentiment score for each sentence by taking its negations into account.

# ---- Sentence-level sentiment (sentimentr) --------------------------------
# Split each letter into sentences and score them; sentimentr handles
# valence shifters (negations, amplifiers) that token-level lexicons miss.
letters %>%
  mutate(text = as.character(text)) %>%  # tolerate list or character column
  sentimentr::get_sentences() %>%
  sentimentr::sentiment() %>%
  mutate(characters = nchar(stripWhitespace(text))) %>%
  filter(characters > 1) -> senti_sentences  # drop empty fragments

senti_sentences$sentiment <- as.numeric(senti_sentences$sentiment)
# Tag each sentence by the sign of its score.  The original nested a
# fourth, unreachable 'NA' branch inside the ifelse() chain; a score is
# either zero, positive, or negative, so two branches suffice.
senti_sentences$pntag <- ifelse(senti_sentences$sentiment == 0, 'Neutral',
                         ifelse(senti_sentences$sentiment > 0, 'Positive',
                                                               'Negative'))

# Sequential sentence id within each year's letter.
senti_sentences <- senti_sentences %>%
  group_by(year) %>%
  mutate(sentence_id = row_number()) %>%
  ungroup()
              
            # ---- Sentence sentiment scatter plots, 1984-1991 ---------------
# Shared x-axis settings for all panels.
ax <- list(
  title = "Line Number",
  zeroline = FALSE,
  showline = FALSE,
  showticklabels = TRUE
)

pal <- c("red", "blue", "green")

# Build one scatter trace of sentence sentiment for a single year.
# Extracted because the original repeated the identical plot_ly() call
# eight times; it also passed an invalid `legend=FALSE` layout attribute,
# which plotly ignores, so it is dropped here.
sentiment_scatter <- function(yr) {
  plot_ly(data = senti_sentences %>% filter(year %in% yr),
          showlegend = FALSE, x = ~sentence_id, y = ~sentiment,
          color = ~pntag, text = ~text,
          type = 'scatter', mode = 'markers',
          colors = pal, opacity = 0.8) %>%
    layout(xaxis = ax)
}

# One panel per year, 1984-1991, sharing both axes across a 2-row grid.
do.call(subplot, c(lapply(1984:1991, sentiment_scatter),
                   nrows = 2, shareX = TRUE, shareY = TRUE))
# ---- Sentence sentiment scatter plots, 2011-2018 (neutral dropped) --------
pal <- c("red", "blue", "green")

# Scatter of positive/negative sentences for a single year.  Extracted
# because the original repeated the identical plot_ly() call eight times.
# Extra layout arguments (e.g. the title on the first panel) pass through
# via `...`.
polar_scatter <- function(yr, ...) {
  plot_ly(data = senti_sentences %>%
            filter(year %in% yr) %>%
            filter(pntag %in% c("Positive", "Negative")),
          showlegend = FALSE, x = ~sentence_id, y = ~sentiment,
          color = ~pntag, text = ~text,
          type = 'scatter', mode = 'markers',
          colors = pal, opacity = 0.8) %>%
    layout(xaxis = ax, ...)
}

# Title only on the first panel, exactly as in the original.
plots <- c(list(polar_scatter(2011, title = "letters of years 2011 to 2018, from left to right")),
           lapply(2012:2018, polar_scatter))
do.call(subplot, c(plots, nrows = 2, shareX = TRUE, shareY = TRUE))
#------------ Emotion analysis ---------------------------------------------
# Tag every (non-stop, non-numeric) word with its NRC emotion category.
emotion.nrc <- letters %>%
  unnest_tokens(word, text) %>%                  # split text into words
  anti_join(stop_words, by = "word") %>%         # remove stop words
  filter(!grepl('[0-9]', word)) %>%              # remove numbers
  left_join(get_sentiments("nrc"), by = "word")  # NRC emotion per word

# Keep only words that matched an NRC category.  The original compared
# against the string "NA"; unmatched rows hold real NA values, which only
# happened to be dropped because NA comparisons propagate.  Test with
# is.na() explicitly.
filtered_emo <- emotion.nrc %>%
  filter(!is.na(sentiment))

# Treemap of each emotion's overall share across all letters.
filtered_emo %>%
  group_by(sentiment) %>%
  summarise(n = n()) %>%
  mutate(percent = (n / sum(n)) * 100) %>%
  ggplot2::ggplot(aes(area = percent, fill = sentiment,
                      label = paste(round(percent, 1), "%:", sentiment))) +
  treemapify::geom_treemap() +
  treemapify::geom_treemap_text(fontface = "italic", colour = "white",
                                place = "centre", grow = TRUE) +
  scale_fill_brewer(palette = "Set3")

# Yearly emotion composition as a stacked percentage bar chart.
emo_by_year <- filtered_emo %>%
  group_by(year, sentiment) %>%
  summarise(n = n()) %>%
  mutate(percent = (n / sum(n)) * 100) %>%
  ungroup()

ggplot(emo_by_year,
       aes(x = year, y = percent, fill = sentiment,
           label = round(percent, 0))) +
  geom_bar(position = "stack", stat = "identity") +
  geom_text(size = 3, position = position_stack(vjust = 0.5)) +
  scale_fill_brewer(palette = "Set3")

Here are the word-cloud plots used to analyse how his writing has changed over the years.

# Bing-lexicon sentiment words, one row per word occurrence.
senti.bing <- letters %>%
  unnest_tokens(word, text) %>%                # split text into words
  anti_join(stop_words, by = "word") %>%       # remove stop words
  filter(!grepl('[0-9]', word)) %>%            # remove numbers
  inner_join(get_sentiments("bing"), by = "word")

# Comparison cloud of positive vs negative words, 1977-1991.
senti.bing %>%
  filter(year %in% (1977:1991)) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%  # tibble -> matrix
  comparison.cloud(colors = c("orange", "blue"), max.words = 300)
# title() is a base-graphics side effect; the original added it with `+`
# to comparison.cloud()'s NULL return, which printed the stray integer(0).
# The original label also said "1977 to 1981" although the filter above
# keeps 1977-1991.
title("from year 1977 to 1991")

## integer(0)
# Comparison cloud of positive vs negative words, 2004-2018.
senti.bing %>%
  filter(year %in% (2004:2018)) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%  # tibble -> matrix
  comparison.cloud(colors = c("orange", "blue"), max.words = 300)
# Called separately: `+ title(...)` on a base-graphics NULL return was the
# source of the stray integer(0) output.
title("from year 2004 to 2018")

## integer(0)
# Rank sentiment words by tf-idf, treating each year's letter as a document.
top.words_tfidf <- senti.bing %>%
  group_by(word, sentiment, year) %>%
  count(word) %>%
  bind_tf_idf(word, year, n) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  arrange(desc(tf_idf)) %>%
  ungroup()

# Diverging bar chart: negative words are flipped below the axis.
top.words_tfidf %>%
  filter(tf_idf > 0.013) %>%
  mutate(tf_idf = ifelse(sentiment == "negative", -tf_idf, tf_idf),
         word = reorder(word, tf_idf)) %>%
  plot_ly(x = ~word, y = ~tf_idf, type = 'bar',
          color = ~sentiment, hovertext = ~year) %>%
  layout(xaxis = list(title = "word", tickangle = -45),
         yaxis = list(title = "Tf_idf"))

# Top two tf-idf words per year and sentiment.
top_words_year <- top.words_tfidf %>%
  group_by(year, sentiment) %>%
  top_n(2, tf_idf) %>%
  arrange(desc(year, sentiment))
# ---- Trigram analysis ------------------------------------------------------
letters$text <- as.character(letters$text)

# Tokenize into 3-word sequences, dropping any containing digits.
trigrams_letters <- letters %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!grepl('[0-9]', trigram))

# Split each trigram into its three words so stop words can be filtered
# at every position.
trigrams_separated <- trigrams_letters %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ")

trigrams_filtered <- trigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word)

# Overall trigram frequencies, most common first.
trigram_counts <- trigrams_filtered %>%
  count(word1, word2, word3, sort = TRUE)

# Per-year trigram frequencies.
trigrams_united <- trigrams_filtered %>%
  unite(trigram, word1, word2, word3, sep = " ") %>%
  group_by(year, trigram) %>%
  count(year, trigram, sort = TRUE)

head(trigrams_united, 100)
## # A tibble: 100 x 3
            ## # Groups:   year, trigram [100]
            ##     year trigram                         n
            ##    <int> <chr>                       <int>
            ##  1  1983 net tangible assets            12
            ##  2  2011 pre tax earnings               11
            ##  3  2007 pre tax earnings               10
            ##  4  1977 blue chip stamps                9
            ##  5  1996 super cat business              8
            ##  6  2014 berkshire hathaway energy       7
            ##  7  1979 blue chip stamps                6
            ##  8  1980 reported operating earnings     6
            ##  9  1983 nebraska furniture mart         6
            ## 10  2010 pre tax earnings                6
            ## # ... with 90 more rows
# Network graph of trigrams that occur more than five times.
frequent_trigrams <- filter(trigram_counts, n > 5)
trigram_graph <- graph_from_data_frame(frequent_trigrams)

ggraph(trigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

# ---- Word co-occurrence across years ---------------------------------------
# One row per (year, word), stop words and numbers removed.
letter_words <- letters %>%
  unnest_tokens(word, text) %>%
  filter(!grepl('[0-9]', word),
         !word %in% stop_words$word)

# How often two words appear in the same year's letter.
word_pairs <- letter_words %>%
  pairwise_count(word, year, sort = TRUE)

# Correlation (phi coefficient) between words appearing in the same years,
# restricted to words used at least 20 times overall.
word_cors <- letter_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, year, sort = TRUE)

# Graph of strongly correlated word pairs (phi > 0.9).
word_cors %>%
  filter(correlation > .9) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

#-------------------------- Extract phrases -----------------------------------
            #extracting collocated nouns and adjective
            # NOTE(review): udpipe_download_model() fetches the English model
            # from the internet on every run; consider caching the file.
            en <- udpipe_download_model(language = "english")
            udmodel_english <- udpipe_load_model(file = en$file_model)
            # Annotate every letter: tokenization, lemmatization and
            # part-of-speech tagging (upos column used below).
            anno_words <- udpipe_annotate(udmodel_english, x = letters$text)
            anno_words <- as.data.frame(anno_words)
#anno_words %>%keywords_collocation(term = "token", group = c("doc_id", "sentence_id"),ngram_max = 5)->phrase_words
            ## Co-occurrences: How frequent do words occur in the same sentence, in this case only nouns or adjectives
            # Lemmas are counted as co-occurring when they appear in the same
            # (doc_id, sentence_id) group; only NOUN/ADJ tokens are relevant.
            stats <- cooccurrence(x = anno_words$lemma, 
                                  relevant = anno_words$upos %in% c("NOUN", "ADJ"), group = c("doc_id", "sentence_id"))
            ## Co-occurrences: How frequent do words follow one another
            
            ## Co-occurrences: How frequent do words follow one another even if we would skip 2 words in between
            # stats <- cooccurrence(x = anno_words$lemma, 
            #                       relevant = anno_words$upos %in% c("NOUN", "ADJ"), skipgram = 2)
# Plot the 300 strongest noun/adjective co-occurrences as a network.
# NOTE(review): the cooccurrence() call above groups by sentence, not by a
# 3-word window — the title below may overstate the distance; confirm.
wordnetwork <- graph_from_data_frame(head(stats, 300))
ggraph(wordnetwork, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme(legend.position = "none") +
  labs(title = "Cooccurrences within 3 words distance", subtitle = "Nouns & Adjective")

# Inspect the 200 strongest co-occurring noun/adjective pairs.
stats<-as.data.frame(stats)
            head(stats,200)
##             term1          term2 cooc
            ## 1            last           year  692
            ## 2            book          value  241
            ## 3       intrinsic          value  237
            ## 4             net          worth  217
            ## 5          annual        meeting  167
            ## 6          market          value  139
            ## 7        business          value  137
            ## 8         balance          sheet  134
            ## 9          annual         report  133
            ## 10      insurance       business  124
            ## 11         common          stock  120
            ## 12      insurance      operation  119
            ## 13            few           year  115
            ## 14   underwriting         profit  106
            ## 15      insurance        company  105
            ## 16        capital           gain  104
            ## 17         recent           year  102
            ## 18      operating        earning   87
            ## 19           many           year   86
            ## 20         income            tax   84
            ## 21   underwriting           loss   81
            ## 22      intrinsic       business   80
            ## 23     investment         income   78
            ## 24          stock         market   66
            ## 25         market          price   61
            ## 26      financial      statement   60
            ## 27         -share           book   57
            ## 28        premium         volume   55
            ## 29          stock          price   54
            ## 30       interest           rate   52
            ## 31      insurance       industry   51
            ## 32          large         amount   51
            ## 33       tangible          asset   51
            ## 34           earn          power   50
            ## 35      financial       strength   50
            ## 36            tax           rate   49
            ## 37           real         estate   48
            ## 38          other           word   47
            ## 39       american       business   47
            ## 40        present     management   45
            ## 41        pre-tax        earning   45
            ## 42     accounting     adjustment   43
            ## 43         equity        capital   42
            ## 44     investment         banker   42
            ## 45          other       business   41
            ## 46   underwriting         result   41
            ## 47           most           case   41
            ## 48   contribution        program   41
            ## 49     marketable       security   40
            ## 50            net     investment   40
            ## 51       minority       interest   40
            ## 52        capital     allocation   40
            ## 53       interest        expense   40
            ## 54           auto      insurance   39
            ## 55           year         period   38
            ## 56       economic characteristic   38
            ## 57            new       business   38
            ## 58        meeting     credential   38
            ## 59            net        earning   37
            ## 60       purchase          price   37
            ## 61         market          share   37
            ## 62              b          share   37
            ## 63     percentage          point   35
            ## 64    competitive      advantage   35
            ## 65           good       business   34
            ## 66          other          major   34
            ## 67           loss        reserve   34
            ## 68          stock     investment   33
            ## 69           long           time   33
            ## 70        earning      statement   33
            ## 71          large            sum   32
            ## 72          great           many   32
            ## 73          large           part   32
            ## 74            net       tangible   32
            ## 75            per         -share   32
            ## 76     derivative       contract   32
            ## 77           many          other   31
            ## 78          proxy       material   31
            ## 79        current          asset   31
            ## 80          index           fund   31
            ## 81          total        current   30
            ## 82        current      liability   30
            ## 83            cat       business   30
            ## 84          total        Earning   29
            ## 85           good           news   29
            ## 86            tax        earning   29
            ## 87         -share      intrinsic   29
            ## 88        textile       business   28
            ## 89     unrealized           gain   28
            ## 90      operating        manager   28
            ## 91     accounting           rule   28
            ## 92           good         return   28
            ## 93          other        current   28
            ## 94         public        company   27
            ## 95         profit         margin   27
            ## 96            net         income   27
            ## 97           next           year   27
            ## 98            tax         return   27
            ## 99         facing           page   27
            ## 100          most           year   26
            ## 101     corporate    performance   26
            ## 102          loss           cost   26
            ## 103         other        insurer   26
            ## 104          vice       Chairman   26
            ## 105      electric        utility   26
            ## 106          more          money   25
            ## 107        annual        earning   25
            ## 108 undistributed        earning   25
            ## 109           low          price   25
            ## 110        parent        company   25
            ## 111          true       economic   25
            ## 112          term           debt   25
            ## 113  depreciation         charge   25
            ## 114          acre           site   25
            ## 115         other           hand   24
            ## 116    investment        manager   24
            ## 117  amortization         charge   24
            ## 118          many         people   24
            ## 119       several           year   23
            ## 120          year         report   23
            ## 121           tax           cost   23
            ## 122           low           cost   23
            ## 123         major       investee   23
            ## 124        equity     investment   22
            ## 125       pension           fund   22
            ## 126        -share        earning   22
            ## 127         major            way   22
            ## 128        actual          owner   22
            ## 129       nominee           name   22
            ## 130          many         decade   22
            ## 131       capital    expenditure   22
            ## 132           key         figure   22
            ## 133   shareholder       discount   22
            ## 134       special        pricing   22
            ## 135          long           term   21
            ## 136        worker   compensation   21
            ## 137      casualty      insurance   21
            ## 138   reinsurance       business   21
            ## 139           few           case   21
            ## 140      economic          value   21
            ## 141      negative         return   21
            ## 142         other        company   21
            ## 143           new    shareholder   21
            ## 144         stock         option   21
            ## 145      specific       business   21
            ## 146          huge         amount   21
            ## 147         other       respects   21
            ## 148        report          apply   21
            ## 149          sees        candies   20
            ## 150        entire       business   20
            ## 151     insurance        manager   20
            ## 152          past         decade   20
            ## 153      positive         return   20
            ## 154          past         report   20
            ## 155          past           year   20
            ## 156      eligible          share   20
            ## 157        future        program   20
            ## 158       federal         income   20
            ## 159    percentage         change   20
            ## 160          wide        variety   20
            ## 161          many        company   20
            ## 162          cash     equivalent   20
            ## 163     Financial        product   20
            ## 164         other          event   20
            ## 165           top      counselor   20
            ## 166        annual     percentage   20
            ## 167      calendar           year   20
            ## 168       utility      operation   20
            ## 169         other     intangible   20
            ## 170          cost         market   19
            ## 171      economic        reality   19
            ## 172           tax          basis   19
            ## 173          good         reason   19
            ## 174          term           bond   19
            ## 175          last            few   19
            ## 176          next            few   19
            ## 177     corporate            tax   19
            ## 178          next         decade   19
            ## 179         other           time   19
            ## 180           bad           news   19
            ## 181         price     accounting   19
            ## 182         other          asset   19
            ## 183     insurance          world   19
            ## 184          auto        insurer   19
            ## 185          main           gala   19
            ## 186        equity      securitie   19
            ## 187   corporation           such   19
            ## 188   appropriate            tax   19
            ## 189     aggregate            lag   19
            ## 190      business          model   19
            ## 191         hedge           fund   19
            ## 192     insurance     subsidiary   18
            ## 193     excellent       business   18
            ## 194         small       fraction   18
            ## 195         small        portion   18
            ## 196        square           foot   18
            ## 197      economic       interest   18
            ## 198        future           year   18
            ## 199      american    corporation   18
            ## 200      business         school   18
# Simple noun phrases: one or more adjectives/nouns ending in a noun,
# optionally chained through a preposition, determiner and another
# adjective+noun group (pattern "(A|N)+N(P+D*(A|N)*N)*").
anno_words$phrase_tag <- as_phrasemachine(anno_words$upos, type = "upos")

stats <- keywords_phrases(
  x = anno_words$phrase_tag,
  term = anno_words$token,
  pattern = "(A|N)+N(P+D*(A|N)*N)*",
  is_regex = TRUE,
  ngram_max = 8,
  detailed = FALSE
)

head(stats, 100)
##                      keyword ngram freq
            ## 1                  last year     2  365
            ## 2                  Last year     2  248
            ## 3                 book value     2  241
            ## 4            intrinsic value     2  236
            ## 5                   which we     2  188
            ## 6                    that we     2  170
            ## 7              our insurance     2  157
            ## 8                  net worth     2  152
            ## 9               market value     2  129
            ## 10            business value     2  128
            ## 11        Berkshire Hathaway     2  121
            ## 12            annual meeting     2  120
            ## 13        insurance business     2  116
            ## 14                 few years     2  115
            ## 15             balance sheet     2  114
            ## 16              Scott Fetzer     2  103
            ## 17       underwriting profit     2  102
            ## 18              recent years     2  102
            ## 19          our shareholders     2   98
            ## 20            Furniture Mart     2   98
            ## 21             capital gains     2   96
            ## 22        Nebraska Furniture     2   94
            ## 23                   what we     2   92
            ## 24        operating earnings     2   90
            ## 25   Nebraska Furniture Mart     3   89
            ## 26       insurance companies     2   87
            ## 27          American Express     2   87
            ## 28                   year we     2   86
            ## 29                 our float     2   83
            ## 30                 Blue Chip     2   82
            ## 31                many years     2   82
            ## 32             annual report     2   80
            ## 33        intrinsic business     2   80
            ## 34               Wells Fargo     2   80
            ## 35              common stock     2   79
            ## 36           Washington Post     2   77
            ## 37                    SP 500     2   77
            ## 38  intrinsic business value     3   76
            ## 39                last years     2   75
            ## 40                General Re     2   74
            ## 41                    that I     2   71
            ## 42          Berkshire shares     2   68
            ## 43                 those who     2   66
            ## 44                   10 a.m.     2   66
            ## 45              stock market     2   65
            ## 46             business that     2   65
            ## 47    Berkshire shareholders     2   64
            ## 48           businesses that     2   64
            ## 49            our businesses     2   64
            ## 50               Wall Street     2   63
            ## 51                    Mrs. B     2   63
            ## 52       insurance operation     2   62
            ## 53              our purchase     2   62
            ## 54                  New York     2   61
            ## 55            companies that     2   60
            ## 56                which they     2   59
            ## 57                 ten years     2   59
            ## 58             our operating     2   57
            ## 59        National Indemnity     2   57
            ## 60      financial statements     2   57
            ## 61         underwriting loss     2   57
            ## 62               -share book     2   57
            ## 63                    6 p.m.     2   57
            ## 64      insurance operations     2   56
            ## 65         -share book value     3   56
            ## 66            premium volume     2   53
            ## 67             earning power     2   52
            ## 68                World Book     2   52
            ## 69        insurance industry     2   51
            ## 70                 what they     2   51
            ## 71                five years     2   51
            ## 72           tangible assets     2   51
            ## 73        financial strength     2   50
            ## 74              our managers     2   50
            ## 75   Washington Post Company     3   49
            ## 76              Post Company     2   49
            ## 77           our shareholder     2   49
            ## 78          pre-tax earnings     2   49
            ## 79               real estate     2   48
            ## 80              Buffalo News     2   48
            ## 81              managers who     2   47
            ## 82               other words     2   47
            ## 83                    year I     2   47
            ## 84    Berkshire Hathaway Inc     3   46
            ## 85              Hathaway Inc     2   46
            ## 86         investment income     2   46
            ## 87              our holdings     2   46
            ## 88                years that     2   46
            ## 89              Last year we     3   46
            ## 90                 Warren E.     2   45
            ## 91         Warren E. Buffett     3   45
            ## 92                E. Buffett     2   45
            ## 93              HATHAWAY INC     2   43
            ## 94                  our cost     2   43
            ## 95             our ownership     2   43
            ## 96             earnings that     2   43
            ## 97        present management     2   43
            ## 98            Charlie Munger     2   43
            ## 99          their businesses     2   43
            ## 100           equity capital     2   42
# Most occurring adjectives: keep ADJ tokens, count them, and plot the
# 30 most frequent as a horizontal bar chart.
library(lattice)
adjectives <- subset(anno_words, upos %in% c("ADJ"))
stats <- txt_freq(adjectives$token)
# Reverse the factor levels so the most frequent term plots at the top.
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq,
         data = head(stats, 30),
         col = "cadetblue",
         main = "Most occurring adjectives",
         xlab = "Freq")

# Most occurring adverbs: same treatment as the adjectives, but for the
# ADV part-of-speech tag.
library(lattice)
adverbs <- subset(anno_words, upos %in% c("ADV"))
stats <- txt_freq(adverbs$token)
# Reverse the factor levels so the most frequent term plots at the top.
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq,
         data = head(stats, 30),
         col = "cadetblue",
         main = "Most occurring adverbs",
         xlab = "Freq")

# Finding key words with RAKE (Rapid Automatic Keyword Extraction),
# scored over NOUN/ADJ lemmas grouped by document.
stats <- keywords_rake(x = anno_words, term = "lemma", group = "doc_id",
                       relevant = anno_words$upos %in% c("NOUN", "ADJ"))
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
# Fix: the original `subset(stats, doc_id="doc1")` used `=` instead of `==`,
# so `doc_id` was passed as an ignored named argument to subset() and no
# filtering ever happened (keywords_rake()'s non-detailed output has no
# doc_id column to filter on). Plot the top 20 keywords explicitly.
barchart(key ~ rake, data = head(stats, 20), col = "cadetblue",
         main = "Keywords identified by RAKE",
         xlab = "Rake")

# Collocation keywords scored by PMI (Pointwise Mutual Information):
# lower-case the tokens, score adjacent-word pairs per document, and plot
# the top 20 collocations that occur more than 10 times.
anno_words$word <- tolower(anno_words$token)
colloc <- keywords_collocation(x = anno_words, term = "word", group = "doc_id")
# Reverse the factor levels so the strongest collocation plots at the top.
colloc$key <- factor(colloc$keyword, levels = rev(colloc$keyword))
frequent <- subset(colloc, freq > 10)
barchart(key ~ pmi,
         data = head(frequent, 20),
         col = "cadetblue",
         main = "Keywords identified by PMI Collocation",
         xlab = "PMI (Pointwise Mutual Information)")