Use NLTK to clean the chat text and build the word-indexing tool. The script below reads the raw lines, lowercases them, strips characters outside a whitelist, drops question/answer pairs that are too short or too long, tokenizes the remaining lines, builds idx2w/w2idx lookup tables from the most frequent words with nltk.FreqDist, zero-pads the index sequences, and saves the arrays plus a metadata pickle to disk.
EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz '  # space is included in the whitelist
EN_BLACKLIST = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\''

FILENAME = 'data/chat.txt'

# sequence-length limits
# NOTE: the original 'maxq'/'maxa' values were lost in extraction; 20/20 is an assumed placeholder
limit = {
    'maxq': 20,
    'minq': 0,
    'maxa': 20,
    'mina': 3
}

UNK = 'unk'
VOCAB_SIZE = 6000

import random
import sys
import itertools
import pickle
from collections import defaultdict

import nltk
import numpy as np


def ddefault():
    return 1


'''
 read lines from file
    return [list of lines]
'''
def read_lines(filename):
    return open(filename).read().split('\n')[:-1]


'''
 split sentences in one line into multiple lines
    return [list of lines]
'''
def split_line(line):
    return line.split('.')


'''
 remove anything that isn't in the vocabulary
    return str(pure en/ta)
'''
def filter_line(line, whitelist):
    return ''.join([ch for ch in line if ch in whitelist])


'''
 read list of words, create index-to-word and word-to-index dictionaries
    return tuple( vocab->(word, count), idx2w, w2idx )
'''
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [x[0] for x in vocab]
    # word2index
    word2index = dict([(w, i) for i, w in enumerate(index2word)])
    return index2word, word2index, freq_dist


'''
 filter out too long and too short sequences
    return tuple( filtered_q, filtered_a )
'''
def filter_data(sequences):
    filtered_q, filtered_a = [], []
    raw_data_len = len(sequences) // 2  # lines come in (question, answer) pairs

    for i in range(0, len(sequences), 2):
        qlen, alen = len(sequences[i].split(' ')), len(sequences[i + 1].split(' '))
        if qlen >= limit['minq'] and qlen <= limit['maxq']:
            if alen >= limit['mina'] and alen <= limit['maxa']:
                filtered_q.append(sequences[i])
                filtered_a.append(sequences[i + 1])

    # print the fraction of the original data that was filtered out
    filt_data_len = len(filtered_q)
    filtered = int((raw_data_len - filt_data_len) * 100 / raw_data_len)
    print(str(filtered) + '% filtered from original data')

    return filtered_q, filtered_a


'''
 create the final dataset:
  - convert list of items to arrays of indices
  - add zero padding
    return ( array_q([indices]), array_a([indices]) )
'''
def zero_pad(qtokenized, atokenized, w2idx):
    # num of rows
    data_len = len(qtokenized)

    # numpy arrays to store indices
    idx_q = np.zeros([data_len, limit['maxq']], dtype=np.int32)
    idx_a = np.zeros([data_len, limit['maxa']], dtype=np.int32)

    for i in range(data_len):
        q_indices = pad_seq(qtokenized[i], w2idx, limit['maxq'])
        a_indices = pad_seq(atokenized[i], w2idx, limit['maxa'])

        #print(len(idx_q[i]), len(q_indices))
        #print(len(idx_a[i]), len(a_indices))
        idx_q[i] = np.array(q_indices)
        idx_a[i] = np.array(a_indices)

    return idx_q, idx_a


'''
 replace words with indices in a sequence,
  replace with unknown if word not in lookup
    return [list of indices]
'''
def pad_seq(seq, lookup, maxlen):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            indices.append(lookup[UNK])
    return indices + [0] * (maxlen - len(seq))


def process_data():

    print('\n>> Read lines from file')
    lines = read_lines(filename=FILENAME)

    # change to lower case (just for en)
    lines = [line.lower() for line in lines]

    print('\n:: Sample from read lines')
    print(lines[121:125])

    # filter out unnecessary characters
    print('\n>> Filter lines')
    lines = [filter_line(line, EN_WHITELIST) for line in lines]
    print(lines[121:125])

    # filter out too long or too short sequences
    print('\n>> 2nd layer of filtering')
    qlines, alines = filter_data(lines)
    print('\nq : {0} ; a : {1}'.format(qlines[60], alines[60]))
    print('\nq : {0} ; a : {1}'.format(qlines[61], alines[61]))

    # convert list of [lines of text] into list of [list of words]
    print('\n>> Segment lines into words')
    qtokenized = [wordlist.split(' ') for wordlist in qlines]
    atokenized = [wordlist.split(' ') for wordlist in alines]
    print('\n:: Sample from segmented list of words')
    print('\nq : {0} ; a : {1}'.format(qtokenized[60], atokenized[60]))
    print('\nq : {0} ; a : {1}'.format(qtokenized[61], atokenized[61]))

    # indexing -> idx2w, w2idx
    print('\n >> Index words')
    idx2w, w2idx, freq_dist = index_(qtokenized + atokenized, vocab_size=VOCAB_SIZE)

    print('\n >> Zero Padding')
    idx_q, idx_a = zero_pad(qtokenized, atokenized, w2idx)

    print('\n >> Save numpy arrays to disk')
    # save them
    np.save('idx_q.npy', idx_q)
    np.save('idx_a.npy', idx_a)

    # let us now save the necessary dictionaries
    metadata = {
        'w2idx': w2idx,
        'idx2w': idx2w,
        'limit': limit,
        'freq_dist': freq_dist
    }

    # write to disk: data control dictionaries
    with open('metadata.pkl', 'wb') as f:
        pickle.dump(metadata, f)


def load_data(PATH=''):
    # read data control dictionaries
    with open(PATH + 'metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    # read numpy arrays
    idx_q = np.load(PATH + 'idx_q.npy')
    idx_a = np.load(PATH + 'idx_a.npy')
    return metadata, idx_q, idx_a


if __name__ == '__main__':
    process_data()
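For reference, a minimal usage sketch of the saved artefacts. It assumes the script above is saved as data.py and has already been run, so idx_q.npy, idx_a.npy and metadata.pkl exist in the working directory; the module name data and the 80/20 train/test split are illustrative assumptions, not part of the original script.

import data  # assumed module name for the preprocessing script above

metadata, idx_q, idx_a = data.load_data(PATH='')
w2idx, idx2w = metadata['w2idx'], metadata['idx2w']

print('vocabulary size :', len(idx2w))
print('question array  :', idx_q.shape)  # (num_pairs, maxq)
print('answer array    :', idx_a.shape)  # (num_pairs, maxa)

# decode one padded question back to words (index 0 is the padding token '_')
print('sample question :', ' '.join(idx2w[i] for i in idx_q[0] if i != 0))

# illustrative 80/20 train/test split (not part of the original script)
split = int(0.8 * idx_q.shape[0])
train_q, test_q = idx_q[:split], idx_q[split:]
train_a, test_a = idx_a[:split], idx_a[split:]
print('train pairs :', train_q.shape[0], '| test pairs :', test_q.shape[0])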