qq:231469242 Original
Single PDF Content Extraction
# -*- coding: utf-8 -*-
"""Extract the database fields of a single USAN PDF.

Background note kept from the original script: io.open() is the preferred,
higher-level interface to file I/O. It wraps the OS-level file descriptor in
an object that you can use to access the file in a Pythonic manner.
os.open() is just a wrapper for the lower-level POSIX syscall. It takes less
symbolic (and more POSIX-y) arguments, and returns the file descriptor (a
number) that represents the opened file. It does not return a file object;
the returned value will not have read() or write() methods.

NOTE(review): this file was reconstructed from a whitespace- and
case-mangled copy. Identifier casing and the label literals
('MOLECULAR WEIGHT', 'WHO NUMBER', 'UNII') are best-effort -- confirm
against a real PDF before relying on them.
"""
import math
import re
from io import StringIO
from io import open

# PDF file name
pdfFilename = "avelumab.pdf"
# File name prefix used to build the download path
frontName = "usan/2016/"
# Trademark database file name
trademark_filename = "Trademarks.txt"
# Sponsor database file name
sponsor_filename = "Sponsor.txt"

# Lookup tables read by get_me_trademark() / get_me_sponsor().  They start
# empty so the helpers are importable; the script body fills them in.
list_trademarks = []
list_sponsors = []

# Patterns are compiled once instead of on every call.
_RE_FORMULA = re.compile(r'C(\d)+H(\d)+')
_RE_CAS = re.compile(r'(\d)+-(\d)+-(\d)+')
_RE_WHO = re.compile(r'(\d)+')
_RE_UNII = re.compile(r'[a-zA-Z0-9]{10}')


def _parse_number(text):
    """Return *text* as an int or a finite float, or None if it is neither.

    Replaces the original ``eval(text)``, which executed arbitrary
    expressions found in the PDF text.
    """
    for convert in (int, float):
        try:
            number = convert(text)
        except (TypeError, ValueError):
            continue
        if isinstance(number, float) and not math.isfinite(number):
            return None
        return number
    return None


def _value_after(the_list_data, label, accept):
    """Return the first line that follows a line containing *label* and
    satisfies *accept*; return "" when there is none.

    Pairing each line with its successor means a label on the very last
    line is simply ignored instead of raising IndexError.
    """
    for line, value in zip(the_list_data, the_list_data[1:]):
        if label in line and accept(value):
            return value
    return ""


def readPDF(pdfFile):
    """Read PDF data: return the text of the binary file object *pdfFile*."""
    # pip3 install pdfminer3k
    # Imported here so the pure text helpers stay usable without pdfminer.
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf

    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            device.close()
        return retstr.getvalue()


def Format(str1):
    """Normalise PDF text: split it into lines and drop the blank ones."""
    return [line for line in str1.split("\n") if line.strip()]


def get_me_usan(the_list_data):
    """Extract me_usan (the drug name): the first line, or "" if no lines."""
    return the_list_data[0] if the_list_data else ""


def get_me_therapeutic(the_list_data):
    """Extract me_therapeutic: first line containing "treatment of", else ""."""
    for line in the_list_data:
        if "treatment of" in line:
            return line
    # The original fell through and returned None; every sibling returns "".
    return ""


def get_me_chemical1(the_list_data):
    """Extract me_chemical1 (chemical name 1): first line containing "1."."""
    for line in the_list_data:
        if "1." in line:
            return line
    return ""


def get_me_chemical2(the_list_data):
    """Extract me_chemical2 (chemical name 2): first line containing "2."."""
    for line in the_list_data:
        if "2." in line:
            return line
    return ""


def re_formula(str1):
    """Return True if *str1* contains a C<digits>H<digits> formula."""
    return _RE_FORMULA.search(str1) is not None


def get_me_mo_formula(the_list_data):
    """Extract me_mo_formula: the first line, upper-cased, that contains the
    carbon/hydrogen pattern; "" when none does."""
    for line in the_list_data:
        upper = line.upper()
        if re_formula(upper):
            return upper
    return ""


def get_me_mo_weight(the_list_data):
    """Extract me_mo_weight: the line after 'MOLECULAR WEIGHT' when that line
    is an integer or a float; "" otherwise."""
    return _value_after(
        the_list_data,
        'MOLECULAR WEIGHT',
        lambda value: _parse_number(value) is not None,
    )


def get_txt_contents(filename):
    """Return the non-blank lines of the text file *filename*.

    Blank lines are skipped because an empty search term is a substring of
    every line and would make every lookup match its first candidate.
    """
    with open(filename) as file:
        lines = [line.replace("\n", "") for line in file]
    return [line for line in lines if line.strip()]


def get_me_trademark(the_list_data):
    """Extract me_trademark: first stripped line containing an entry of the
    module-level ``list_trademarks``; "" when none does."""
    for line in the_list_data:
        line = line.strip()
        if any(k and k in line for k in list_trademarks):
            return line
    return ""


def get_me_sponsor(the_list_data):
    """Extract me_sponsor: first stripped line containing an entry of the
    module-level ``list_sponsors``; "" when none does (e.g. a new company)."""
    for line in the_list_data:
        line = line.strip()
        if any(k and k in line for k in list_sponsors):
            return line
    return ""


def re_cas(str1):
    """Return True if *str1* contains a digits-digits-digits CAS pattern."""
    return _RE_CAS.search(str1) is not None


def get_cas(the_list_data):
    """Extract the CAS line: first line containing a CAS pattern, else ""."""
    for line in the_list_data:
        if re_cas(line):
            return line
    return ""


def re_who(str1):
    """Return True if *str1* contains at least one digit."""
    return _RE_WHO.search(str1) is not None


def get_who(the_list_data):
    """Extract the WHO number: the line after 'WHO NUMBER' when that line is
    an integer; "" otherwise."""
    return _value_after(
        the_list_data,
        'WHO NUMBER',
        lambda value: isinstance(_parse_number(value), int),
    )


def re_unii(str1):
    """Return True if *str1* contains 10 consecutive alphanumerics."""
    # {10} means exactly ten repetitions.
    return _RE_UNII.search(str1) is not None


def get_unii(the_list_data):
    """Extract the UNII: the line after 'UNII' when it matches re_unii()."""
    return _value_after(the_list_data, 'UNII', re_unii)


def get_me_down(the_list_data):
    """Return the me_down path (prefix + PDF file name).

    *the_list_data* is accepted for signature compatibility and is unused.
    """
    return frontName + pdfFilename


if __name__ == "__main__":
    # The context manager closes the PDF even if extraction fails.
    with open(pdfFilename, 'rb') as pdfFile:
        outputString = readPDF(pdfFile)
    list_data = Format(outputString)

    me_source = 2016
    # Drug name
    me_usan = get_me_usan(list_data)
    # Disease treated
    me_therapeutic = get_me_therapeutic(list_data)
    # Chemical name 1
    me_chemical1 = get_me_chemical1(list_data)
    # Chemical name 2
    me_chemical2 = get_me_chemical2(list_data)
    # Molecular formula (contains carbon and hydrogen)
    me_mo_formula = get_me_mo_formula(list_data)
    # Trademark database and trademark name
    list_trademarks = get_txt_contents(trademark_filename)
    me_trademark = get_me_trademark(list_data)
    # Sponsor database and sponsor; a new company will not be found
    list_sponsors = get_txt_contents(sponsor_filename)
    me_sponsor = get_me_sponsor(list_data)
    # CAS, WHO and UNII identifiers
    me_CAS = get_cas(list_data)
    me_WHO = get_who(list_data)
    me_UNII = get_unii(list_data)
    # Download path
    me_down = get_me_down(list_data)
    # me_bianma and me_ylbm default to empty
    me_bianma = ""
    me_ylbm = ""
    # Molecular-weight extraction is switched off in this script.
    me_mo_weight = ""
Multiple PDF Content Extraction
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 27 11:37:54 2016

Batch-extract USAN PDF data into a spreadsheet (written as Output.csv).

NOTE(review): this file was reconstructed from a whitespace- and
case-mangled copy. Identifier casing, the header strings and the label
literals ('MOLECULAR WEIGHT', 'WHO NUMBER', 'UNII') are best-effort --
confirm against a real PDF before relying on them.
"""
import csv
import logging
import math
import os
import re
from io import StringIO
from io import open

import pandas  # unused here; retained from the original import list

logger = logging.getLogger(__name__)

# --- Parameters ---------------------------------------------------------
# File name prefix used to build the download path
frontName = "usan/2016/"
# me_source field
me_source = 2016
# me_bianma and me_ylbm default to empty
me_bianma = ""
me_ylbm = ""
me_code = ""
me_en = ""
# Disease/therapeutic database file name
therapeutic_filename = "Therapeutic.txt"
# Trademark database file name
trademark_filename = "Trademarks.txt"
# Sponsor database file name
sponsor_filename = "Sponsor.txt"

# Header row of the output table
list_firstRow = [
    "me_uid", "me_source", "me_usan", "me_therapeutic", "me_chemical1",
    "me_chemical2", "me_mo_formula", "me_mo_weight", "me_trademark",
    "me_sponsor", "me_codename", "me_cas", "me_who", "me_unii", "me_en",
    "me_down", "me_bianma", "me_ylbm",
]

# Lookup tables read by the get_me_* helpers.  They start empty so the
# helpers are importable; the script body fills them in.
list_therapeutic = []
list_trademarks = []
list_sponsors = []

# Patterns are compiled once instead of on every call.
_RE_FORMULA = re.compile(r'C(\d)+H(\d)+')
_RE_CAS = re.compile(r'(\d)+-(\d)+-(\d)+')
_RE_WHO = re.compile(r'(\d)+')
_RE_UNII = re.compile(r'[a-zA-Z0-9]{10}')


def _parse_number(text):
    """Return *text* as an int or a finite float, or None if it is neither.

    Replaces the original ``eval(text)``, which executed arbitrary
    expressions found in the PDF text.
    """
    for convert in (int, float):
        try:
            number = convert(text)
        except (TypeError, ValueError):
            continue
        if isinstance(number, float) and not math.isfinite(number):
            return None
        return number
    return None


def _value_after(the_list_data, label, accept):
    """Return the first line that follows a line containing *label* and
    satisfies *accept*; return "" when there is none.

    Pairing each line with its successor means a label on the very last
    line is simply ignored instead of raising IndexError.
    """
    for line, value in zip(the_list_data, the_list_data[1:]):
        if label in line and accept(value):
            return value
    return ""


def _first_line_matching(the_list_data, terms):
    """Return the first stripped line that contains one of the lower-case
    *terms*, compared case-insensitively; "" when none does."""
    for line in the_list_data:
        line = line.strip()
        lowered = line.lower()
        if any(term and term in lowered for term in terms):
            return line
    return ""


def readPDF(pdfFile):
    """Read PDF data: return the text of the binary file object *pdfFile*."""
    # pip3 install pdfminer3k
    # Imported here so the pure text helpers stay usable without pdfminer.
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf

    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            device.close()
        return retstr.getvalue()


def Format(str1):
    """Normalise PDF text: split it into lines and drop the blank ones.

    The original kept a line only if it contained a whitespace character,
    which kept whitespace-only lines and dropped lines with no space in
    them (such as a bare formula).
    """
    return [line for line in str1.split("\n") if line.strip()]


def get_me_usan(pdfFilename):
    """Extract me_usan (the drug name) from the PDF file name.

    Not one hundred percent accurate: it trusts the file name.
    """
    if pdfFilename.lower().endswith(".pdf"):
        return pdfFilename[:-len(".pdf")]
    return pdfFilename


def get_me_therapeutic(the_list_data):
    """Extract me_therapeutic: the first line, lower-cased, that contains an
    entry of the module-level ``list_therapeutic``; "" when none does."""
    for line in the_list_data:
        lowered = line.lower()
        if any(k and k in lowered for k in list_therapeutic):
            return lowered
    return ""


def get_me_chemical1(the_list_data):
    """Extract me_chemical1 (chemical name 1): first line containing "1."."""
    for line in the_list_data:
        if "1." in line:
            return line
    return ""


def get_me_chemical2(the_list_data):
    """Extract me_chemical2 (chemical name 2): first line containing "2."."""
    for line in the_list_data:
        if "2." in line:
            return line
    return ""


def re_formula(str1):
    """Return True if *str1* contains a C<digits>H<digits> formula."""
    return _RE_FORMULA.search(str1) is not None


def get_me_mo_formula(the_list_data):
    """Extract me_mo_formula: the first line, upper-cased, that contains the
    carbon/hydrogen pattern; "" when none does."""
    for line in the_list_data:
        upper = line.upper()
        if re_formula(upper):
            return upper
    return ""


def get_me_mo_weight(the_list_data):
    """Extract me_mo_weight: the line after 'MOLECULAR WEIGHT' when it
    mentions "kDa" or is an integer or a float; "" otherwise."""
    return _value_after(
        the_list_data,
        'MOLECULAR WEIGHT',
        lambda value: "kDa" in value or _parse_number(value) is not None,
    )


def get_txt_contents(filename):
    """Return the non-blank lines of the text file *filename*, lower-cased.

    Blank lines are skipped because an empty search term is a substring of
    every line and would make every lookup match its first candidate.
    """
    with open(filename) as file:
        lines = [line.replace("\n", "") for line in file]
    return [line.lower() for line in lines if line.strip()]


def get_me_trademark(the_list_data):
    """Extract me_trademark using the module-level ``list_trademarks``.

    The database is lower-cased on load, so the comparison is done on a
    lower-cased copy of each line; the line is returned in its own case.
    """
    return _first_line_matching(the_list_data, list_trademarks)


def get_me_sponsor(the_list_data):
    """Extract me_sponsor using the module-level ``list_sponsors``.

    Same case-insensitive matching as get_me_trademark(); a sponsor that is
    not in the database (a new company) yields "".
    """
    return _first_line_matching(the_list_data, list_sponsors)


def re_cas(str1):
    """Return True if *str1* contains a digits-digits-digits CAS pattern."""
    return _RE_CAS.search(str1) is not None


def get_cas(the_list_data):
    """Extract the CAS line: first line containing a CAS pattern, else ""."""
    for line in the_list_data:
        if re_cas(line):
            return line
    return ""


def re_who(str1):
    """Return True if *str1* contains at least one digit."""
    return _RE_WHO.search(str1) is not None


def get_who(the_list_data):
    """Extract the WHO number: the line after 'WHO NUMBER' when that line is
    an integer; "" otherwise."""
    return _value_after(
        the_list_data,
        'WHO NUMBER',
        lambda value: isinstance(_parse_number(value), int),
    )


def re_unii(str1):
    """Return True if *str1* contains 10 consecutive alphanumerics."""
    # {10} means exactly ten repetitions.
    return _RE_UNII.search(str1) is not None


def get_unii(the_list_data):
    """Extract the UNII: the line after 'UNII' when it matches re_unii()."""
    return _value_after(the_list_data, 'UNII', re_unii)


def get_me_down(pdfFilename):
    """Return the me_down path (prefix + PDF file name)."""
    return frontName + pdfFilename


def get_one_pdf_content(pdfFilename, count):
    """Return one output row (drug name, codes, WHO, UNII, formula, ...) for
    the PDF *pdfFilename*; *count* becomes the row's me_uid.

    The values are in the same order as ``list_firstRow``.
    """
    # The context manager closes the PDF even if extraction fails.
    with open(pdfFilename, 'rb') as pdfFile:
        outputString = readPDF(pdfFile)
    list_data = Format(outputString)

    return [
        count,                              # me_uid
        me_source,
        get_me_usan(pdfFilename),           # drug name
        get_me_therapeutic(list_data),      # disease treated
        get_me_chemical1(list_data),
        get_me_chemical2(list_data),
        get_me_mo_formula(list_data),       # contains carbon and hydrogen
        get_me_mo_weight(list_data),
        get_me_trademark(list_data),
        get_me_sponsor(list_data),          # new companies are not found
        me_code,
        get_cas(list_data),
        get_who(list_data),
        get_unii(list_data),
        me_en,
        get_me_down(pdfFilename),
        me_bianma,
        me_ylbm,
    ]


def get_all_pdf_content(list_pdfFilename):
    """Return the header row followed by one row per PDF file name.

    A PDF that cannot be processed still yields an (empty) row, as before,
    but the failure is now logged instead of being silently swallowed.  The
    rows are collected in a fresh list, so repeated calls do not accumulate.
    """
    rows = [list_firstRow]
    for count, fileName in enumerate(list_pdfFilename):
        try:
            row = get_one_pdf_content(fileName, count)
        except Exception:
            # Best effort: one bad PDF must not abort the whole batch.
            logger.warning("could not extract %r", fileName, exc_info=True)
            row = []
        rows.append(row)
    return rows


if __name__ == "__main__":
    # File names in the current directory; keep only the PDF files.
    list_filenames = os.listdir()
    list_pdfFilename = [
        name for name in list_filenames if name.lower().endswith(".pdf")
    ]

    # Therapeutic, trademark and sponsor databases
    list_therapeutic = get_txt_contents(therapeutic_filename)
    list_trademarks = get_txt_contents(trademark_filename)
    list_sponsors = get_txt_contents(sponsor_filename)

    # Rows for every PDF: names, codes, WHO, UNII, formula and so on
    list_all_pdfContent = get_all_pdf_content(list_pdfFilename)

    with open("Output.csv", 'w', newline='') as csvObj:
        csvWriter = csv.writer(csvObj)
        for rowData in list_all_pdfContent:
            csvWriter.writerow(rowData)
Requires a database
Auto output results
United States USAN Database -- PDF extraction