United States USAN Database: PDF Extraction

Source: Internet
Author: User
Tags: eval, posix

qq:231469242 Original

Single PDF Content Extraction

# -*- coding: utf-8 -*-
"""Extract USAN drug fields from a single PDF statement.

Background note kept from the original listing: io.open() is the preferred,
higher-level interface to file I/O. It wraps the OS-level file descriptor in
an object that you can use to access the file in a Pythonic manner.
os.open() is just a wrapper for the lower-level POSIX syscall. It takes less
symbolic (and more POSIX-y) arguments and returns the file descriptor (a
number) that represents the opened file. It does not return a file object;
the returned value has no read() or write() methods.
"""
import re
from io import StringIO
from io import open

# NOTE(review): this listing was recovered from a whitespace-collapsed,
# case-mangled copy. Identifier casing and label literals were normalised;
# label lookups below are case-insensitive so that either "WHO NUMBER" or
# "WHO number" in the PDF text is found. Confirm against a real statement.

# PDF file to process.
pdffilename = "avelumab.pdf"
# Prefix prepended to the PDF file name to build the download path.
frontname = "usan/2016/"
# Text file with one trademark per line.
trademark_filename = "Trademarks.txt"
# Text file with one sponsor per line.
sponsor_filename = "Sponsor.txt"

# Keyword lists; filled from the text files by the driver at the bottom.
list_trademarks = []
list_sponsors = []

_FORMULA_RE = re.compile(r'C(\d)+H(\d)+')
_CAS_RE = re.compile(r'(\d)+-(\d)+-(\d)+')
_WHO_RE = re.compile(r'(\d)+')
# {10} means exactly ten consecutive alphanumeric characters.
_UNII_RE = re.compile(r'[a-zA-Z0-9]{10}')
_INT_RE = re.compile(r'[+-]?\d+')
_NUMBER_RE = re.compile(r'[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?')


def _values_after_label(lines, label):
    """Yield the line that follows each line containing *label*.

    The label comparison is case-insensitive. A label on the very last line
    has no following line and yields nothing (the original indexed past the
    end of the list and raised IndexError).
    """
    wanted = label.casefold()
    for current, following in zip(lines, lines[1:]):
        if wanted in current.casefold():
            yield following


def _first_keyword_line(lines, keywords):
    """Return the first space-stripped line containing any non-empty keyword, else ""."""
    for line in lines:
        line = line.strip(" ")
        for keyword in keywords:
            # An empty keyword (blank line in the list file) is a substring
            # of every line and would make the first line always match.
            if keyword and keyword in line:
                return line
    return ""


def readpdf(pdffile):
    """Return the text content of the open binary PDF file object *pdffile*."""
    # Imported here so the pure text helpers stay importable without
    # pdfminer installed (pip3 install pdfminer3k).
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf

    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdffile)
        finally:
            device.close()
        return retstr.getvalue()


def format(str1):
    """Split *str1* into lines and drop empty or whitespace-only lines.

    The name shadows the ``format`` builtin; it is kept because callers use it.
    """
    return [line for line in str1.split("\n") if line.strip()]


def get_me_usan(the_list_data):
    """Return the drug name, taken to be the first line, or "" for no lines."""
    return the_list_data[0] if the_list_data else ""


def get_me_therapeutic(the_list_data):
    """Return the first line containing "treatment of" (any case), else ""."""
    for line in the_list_data:
        if "treatment of" in line.casefold():
            return line
    return ""


def get_me_chemical1(the_list_data):
    """Return the first line containing "1." (chemical name 1), else ""."""
    for line in the_list_data:
        if "1." in line:
            return line
    return ""


def get_me_chemical2(the_list_data):
    """Return the first line containing "2." (chemical name 2), else ""."""
    for line in the_list_data:
        if "2." in line:
            return line
    return ""


def re_formula(str1):
    """Return True if *str1* contains a C<digits>H<digits> pattern."""
    return _FORMULA_RE.search(str1) is not None


def get_me_mo_formula(the_list_data):
    """Return the first line, upper-cased, that contains a C..H.. formula, else ""."""
    for line in the_list_data:
        upper = line.upper()
        if re_formula(upper):
            return upper
    return ""


def get_me_mo_weight(the_list_data):
    """Return the line after a "MOLECULAR WEIGHT" line if it is a number, else "".

    The value is validated with a regular expression instead of ``eval``:
    the text comes from an external PDF and must never be executed.
    """
    for value in _values_after_label(the_list_data, "MOLECULAR WEIGHT"):
        if _NUMBER_RE.fullmatch(value.strip()):
            return value
    return ""


def get_txt_contents(filename):
    """Return the lines of the text file *filename* with newlines removed."""
    with open(filename) as handle:
        return [line.replace("\n", "") for line in handle]


def get_me_trademark(the_list_data, trademarks=None):
    """Return the first line containing a known trademark, else "".

    *trademarks* defaults to the module-level ``list_trademarks``.
    """
    names = list_trademarks if trademarks is None else trademarks
    return _first_keyword_line(the_list_data, names)


def get_me_sponsor(the_list_data, sponsors=None):
    """Return the first line containing a known sponsor, else "".

    *sponsors* defaults to the module-level ``list_sponsors``.
    """
    names = list_sponsors if sponsors is None else sponsors
    return _first_keyword_line(the_list_data, names)


def re_cas(str1):
    """Return True if *str1* contains a digits-digits-digits (CAS-like) pattern."""
    return _CAS_RE.search(str1) is not None


def get_cas(the_list_data):
    """Return the first line containing a CAS-like number, else ""."""
    for line in the_list_data:
        if re_cas(line):
            return line
    return ""


def re_who(str1):
    """Return True if *str1* contains at least one digit."""
    return _WHO_RE.search(str1) is not None


def get_who(the_list_data):
    """Return the line after a "WHO number" line if it is an integer, else ""."""
    for value in _values_after_label(the_list_data, "WHO number"):
        if _INT_RE.fullmatch(value.strip()):
            return value
    return ""


def re_unii(str1):
    """Return True if *str1* contains ten consecutive alphanumeric characters."""
    return _UNII_RE.search(str1) is not None


def get_unii(the_list_data):
    """Return the line after a "UNII" line if it looks like a UNII code, else ""."""
    for value in _values_after_label(the_list_data, "UNII"):
        if re_unii(value):
            return value
    return ""


def get_me_down(the_list_data):
    """Return the download path: ``frontname`` + ``pdffilename``.

    *the_list_data* is accepted for signature compatibility and not used.
    """
    return frontname + pdffilename


if __name__ == "__main__":
    with open(pdffilename, "rb") as pdffile:
        outputstring = readpdf(pdffile)
    list_data = format(outputstring)

    me_source = 2016
    # Drug name.
    me_usan = get_me_usan(list_data)
    # Therapeutic claim.
    me_therapeutic = get_me_therapeutic(list_data)
    # Chemical names 1 and 2.
    me_chemical1 = get_me_chemical1(list_data)
    me_chemical2 = get_me_chemical2(list_data)
    # Molecular formula (contains the carbon/hydrogen C..H.. pattern).
    me_mo_formula = get_me_mo_formula(list_data)
    # Trademark lookup against Trademarks.txt.
    list_trademarks = get_txt_contents(trademark_filename)
    me_trademark = get_me_trademark(list_data)
    # Sponsor lookup against Sponsor.txt; companies missing from the file
    # are not found.
    list_sponsors = get_txt_contents(sponsor_filename)
    me_sponsor = get_me_sponsor(list_data)
    me_CAS = get_cas(list_data)
    me_WHO = get_who(list_data)
    me_UNII = get_unii(list_data)
    me_down = get_me_down(list_data)
    # These fields default to empty.
    me_bianma = ""
    me_ylbm = ""
    # Molecular-weight extraction is disabled in the original script.
    me_mo_weight = ""

Multiple PDF Content Extraction

# -*- coding: utf-8 -*-
"""
Created on Tue Dec 27 11:37:54 2016

Batch-extract USAN drug fields from every PDF in the current directory and
write them to a CSV file that can be opened in Excel.
"""
import csv
import logging
import os
import re
from io import StringIO
from io import open

import pandas

# NOTE(review): this listing was recovered from a whitespace-collapsed,
# case-mangled copy. Identifier casing, CSV header names and label literals
# were normalised; label lookups below are case-insensitive so that either
# "WHO NUMBER" or "WHO number" in the PDF text is found. Confirm against a
# real statement.

logger = logging.getLogger(__name__)

# Prefix prepended to each PDF file name to build the download path.
frontname = "usan/2016/"
# Value of the me_source column.
me_source = 2016
# These columns default to empty.
me_bianma = ""
me_ylbm = ""
me_code = ""
me_en = ""
# Text files with one keyword per line.
therapeutic_filename = "Therapeutic.txt"
trademark_filename = "Trademarks.txt"
sponsor_filename = "Sponsor.txt"

# Header row of the output CSV.
list_firstrow = [
    "me_uid", "me_source", "me_usan", "me_therapeutic", "me_chemical1",
    "me_chemical2", "me_mo_formula", "me_mo_weight", "me_trademark",
    "me_sponsor", "me_codename", "me_cas", "me_who", "me_unii", "me_en",
    "me_down", "me_bianma", "me_ylbm",
]

# Keyword lists; filled from the text files by the driver at the bottom.
list_therapeutic = []
list_trademarks = []
list_sponsors = []

_FORMULA_RE = re.compile(r'C(\d)+H(\d)+')
_CAS_RE = re.compile(r'(\d)+-(\d)+-(\d)+')
_WHO_RE = re.compile(r'(\d)+')
# {10} means exactly ten consecutive alphanumeric characters.
_UNII_RE = re.compile(r'[a-zA-Z0-9]{10}')
_INT_RE = re.compile(r'[+-]?\d+')
_NUMBER_RE = re.compile(r'[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?')


def _values_after_label(lines, label):
    """Yield the line that follows each line containing *label*.

    The label comparison is case-insensitive. A label on the very last line
    has no following line and yields nothing (the original indexed past the
    end of the list and raised IndexError).
    """
    wanted = label.casefold()
    for current, following in zip(lines, lines[1:]):
        if wanted in current.casefold():
            yield following


def _first_keyword_line(lines, keywords):
    """Return the first space-stripped line containing any keyword, else "".

    get_txt_contents() lower-cases the keywords, so each line is compared in
    lower case too; the line itself is returned with its original casing.
    """
    for line in lines:
        line = line.strip(" ")
        lowered = line.lower()
        for keyword in keywords:
            # An empty keyword (blank line in the list file) is a substring
            # of every line and would make the first line always match.
            if keyword and keyword in lowered:
                return line
    return ""


def readpdf(pdffile):
    """Return the text content of the open binary PDF file object *pdffile*."""
    # Imported here so the pure text helpers stay importable without
    # pdfminer installed (pip3 install pdfminer3k).
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf

    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdffile)
        finally:
            device.close()
        return retstr.getvalue()


def format(str1):
    """Split *str1* into lines and drop lines with no non-whitespace character.

    The name shadows the ``format`` builtin; it is kept because callers use it.
    """
    return [line for line in str1.split("\n") if line.strip()]


def get_me_usan(pdffilename):
    """Return the drug name derived from the file name (not 100% accurate)."""
    return pdffilename.replace(".pdf", "")


def get_me_therapeutic(the_list_data, therapeutics=None):
    """Return the first line, lower-cased, containing a known indication, else "".

    *therapeutics* defaults to the module-level ``list_therapeutic``.
    """
    names = list_therapeutic if therapeutics is None else therapeutics
    for line in the_list_data:
        lowered = line.lower()
        for keyword in names:
            if keyword and keyword in lowered:
                return lowered
    return ""


def get_me_chemical1(the_list_data):
    """Return the first line containing "1." (chemical name 1), else ""."""
    for line in the_list_data:
        if "1." in line:
            return line
    return ""


def get_me_chemical2(the_list_data):
    """Return the first line containing "2." (chemical name 2), else ""."""
    for line in the_list_data:
        if "2." in line:
            return line
    return ""


def re_formula(str1):
    """Return True if *str1* contains a C<digits>H<digits> pattern."""
    return _FORMULA_RE.search(str1) is not None


def get_me_mo_formula(the_list_data):
    """Return the first line, upper-cased, that contains a C..H.. formula, else ""."""
    for line in the_list_data:
        upper = line.upper()
        if re_formula(upper):
            return upper
    return ""


def get_me_mo_weight(the_list_data):
    """Return the molecular weight found after a "MOLECULAR WEIGHT" line, else "".

    The following line is accepted if it contains "kDa" or is a plain number.
    The value is validated with a regular expression instead of ``eval``:
    the text comes from an external PDF and must never be executed.
    """
    for value in _values_after_label(the_list_data, "MOLECULAR WEIGHT"):
        if "kDa" in value or _NUMBER_RE.fullmatch(value.strip()):
            return value
    return ""


def get_txt_contents(filename):
    """Return the lines of the text file *filename*, newline-free and lower-cased."""
    with open(filename) as handle:
        return [line.replace("\n", "").lower() for line in handle]


def get_me_trademark(the_list_data, trademarks=None):
    """Return the first line containing a known trademark, else "".

    *trademarks* defaults to the module-level ``list_trademarks``.
    """
    names = list_trademarks if trademarks is None else trademarks
    return _first_keyword_line(the_list_data, names)


def get_me_sponsor(the_list_data, sponsors=None):
    """Return the first line containing a known sponsor, else "".

    *sponsors* defaults to the module-level ``list_sponsors``.
    """
    names = list_sponsors if sponsors is None else sponsors
    return _first_keyword_line(the_list_data, names)


def re_cas(str1):
    """Return True if *str1* contains a digits-digits-digits (CAS-like) pattern."""
    return _CAS_RE.search(str1) is not None


def get_cas(the_list_data):
    """Return the first line containing a CAS-like number, else ""."""
    for line in the_list_data:
        if re_cas(line):
            return line
    return ""


def re_who(str1):
    """Return True if *str1* contains at least one digit."""
    return _WHO_RE.search(str1) is not None


def get_who(the_list_data):
    """Return the line after a "WHO number" line if it is an integer, else ""."""
    for value in _values_after_label(the_list_data, "WHO number"):
        if _INT_RE.fullmatch(value.strip()):
            return value
    return ""


def re_unii(str1):
    """Return True if *str1* contains ten consecutive alphanumeric characters."""
    return _UNII_RE.search(str1) is not None


def get_unii(the_list_data):
    """Return the line after a "UNII" line if it looks like a UNII code, else ""."""
    for value in _values_after_label(the_list_data, "UNII"):
        if re_unii(value):
            return value
    return ""


def get_me_down(pdffilename):
    """Return the download path: ``frontname`` + *pdffilename*."""
    return frontname + pdffilename


def get_one_pdf_content(pdffilename, count):
    """Return one CSV row (in ``list_firstrow`` order) for a single PDF.

    *count* is stored as the row's me_uid.
    """
    with open(pdffilename, "rb") as pdffile:
        outputstring = readpdf(pdffile)
    list_data = format(outputstring)

    return [
        count,                              # me_uid
        me_source,
        get_me_usan(pdffilename),
        get_me_therapeutic(list_data),
        get_me_chemical1(list_data),
        get_me_chemical2(list_data),
        get_me_mo_formula(list_data),
        get_me_mo_weight(list_data),
        get_me_trademark(list_data),
        get_me_sponsor(list_data),          # unlisted (new) companies are not found
        me_code,
        get_cas(list_data),
        get_who(list_data),
        get_unii(list_data),
        me_en,
        get_me_down(pdffilename),
        me_bianma,
        me_ylbm,
    ]


def get_all_pdf_content(list_pdffilename):
    """Return the header row followed by one row per PDF in *list_pdffilename*.

    A PDF that cannot be processed is logged and contributes an empty row,
    so one bad file does not abort the batch.
    """
    rows = [list_firstrow]
    for count, filename in enumerate(list_pdffilename):
        try:
            row = get_one_pdf_content(filename, count)
        except Exception:
            logger.exception("Failed to extract %s", filename)
            row = []
        rows.append(row)
    return rows


if __name__ == "__main__":
    # All PDF files in the current directory, sorted so me_uid is stable
    # from run to run.
    list_filenames = os.listdir()
    list_pdffilename = sorted(i for i in list_filenames if i.endswith(".pdf"))

    # Keyword databases.
    list_therapeutic = get_txt_contents(therapeutic_filename)
    list_trademarks = get_txt_contents(trademark_filename)
    list_sponsors = get_txt_contents(sponsor_filename)

    list_all_pdfcontent = get_all_pdf_content(list_pdffilename)

    with open("Output.csv", "w", newline="") as csvobj:
        csv.writer(csvobj).writerows(list_all_pdfcontent)

Requires a database

Auto output results

United States USAN Database: PDF Extraction

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of the page confusing, please write us an email, and we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support; 6 Free Tickets per Quarter; Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.