R can be used to read PDF files and perform text mining on their contents. An example is as follows:
# Here is a PDF for mining
url <- "http://www.noisyroom.net/blog/RomneySpeech072912.pdf"
dest <- tempfile(fileext = ".pdf")
download.file(url, dest, mode = "wb")

# Set the path to pdftotext.exe and convert the PDF to text
exe <- "C:\\Program Files\\xpdfbin-win-3.03\\bin32\\pdftotext.exe"
system(paste("\"", exe, "\" \"", dest, "\"", sep = ""), wait = FALSE)

# Get the txt-file name and open it
filetxt <- sub(".pdf", ".txt", dest)
shell.exec(filetxt); shell.exec(filetxt)  # strangely the first try always throws an error..

# Do something with it, i.e. a simple word cloud
library(tm)
library(wordcloud)
library(Rstem)

txt <- readLines(filetxt)  # don't mind the warning..
txt <- tolower(txt)
txt <- removeWords(txt, c("\\f", stopwords()))

corpus <- Corpus(VectorSource(txt))
corpus <- tm_map(corpus, removePunctuation)
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
d <- data.frame(freq = sort(rowSums(m), decreasing = TRUE))

# Stem words
d$stem <- wordStem(row.names(d), language = "english")

# And put the words into a column, otherwise they would be lost when aggregating
d$word <- row.names(d)

# Remove web addresses (very long strings):
d <- d[nchar(row.names(d)) < 20, ]

# Aggregate frequency by word stem and keep the first word of each group..
agg_freq <- aggregate(freq ~ stem, data = d, sum)
agg_word <- aggregate(word ~ stem, data = d, function(x) x[1])
d <- cbind(freq = agg_freq[, 2], agg_word)

# Sort by frequency
d <- d[order(d$freq, decreasing = TRUE), ]

# Plot the word cloud:
wordcloud(d$word, d$freq)

# Remove the temporary files
file.remove(dir(tempdir(), full.names = TRUE))
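If the external xpdf tool (pdftotext.exe) is not available, a similar result can be obtained entirely from within R using the pdftools package. The following is a minimal sketch; the choice of pdftools and the simplified word-cloud steps are my own assumptions and are not part of the original example:

# Minimal, platform-independent sketch using pdftools instead of pdftotext.exe
# (package choice is an assumption, not part of the original example)
library(pdftools)
library(tm)
library(wordcloud)

url <- "http://www.noisyroom.net/blog/RomneySpeech072912.pdf"
dest <- tempfile(fileext = ".pdf")
download.file(url, dest, mode = "wb")

# pdf_text() returns one character string per page, no external converter needed
txt <- tolower(pdf_text(dest))

corpus <- Corpus(VectorSource(txt))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords())

tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
freq <- sort(rowSums(m), decreasing = TRUE)

# Plot the word cloud and clean up the downloaded file
wordcloud(names(freq), freq)
file.remove(dest)

This variant skips the stemming and aggregation steps for brevity; they can be added back with wordStem() and aggregate() exactly as in the example above.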