python讀取word文檔,插入mysql資料庫執行個體

來源:互聯網
上載者:User

標籤:對象   esc   exe   pymysql   nec   word   ecif   img   find   

表格內容如下:

1、實現大量匯入word文檔,取文檔標題中的數字作為編號

2、除取上面打鉤的內容需要匹配出來入庫入庫,其他內容全部直接入庫mysql

# wuyanfeng
# -*- coding:utf-8 -*-
# 讀取docx中的文本程式碼範例
import docx
import pymysql
import re
import os

# 建立資料庫連結
conn = pymysql.connect(
host=‘rm-bp1vu5d84dg12c6d59o.mysql.rds.aliyuncs.com‘,
port=3306,
user=‘root‘,
passwd=‘wYf092415*‘,
db=‘pays‘,
charset=‘utf8‘,
)
# 建立遊標
cursor = conn.cursor()

#切片函數
def section(info,key,len11):
a = len(info)
print(a, type(a))
d = []
e = 0
g = -1
i = 0
task_class=[]
while i < len(info):
# for i in range(len(info)):
# i+=1
print("i::::", i)
try:
#c = info.index("a", e)
#print("c:::::", c)
c = info.index(key, e)
#print("c:::::", c)

print("c類型判斷",type(c))
except ValueError:
print(ValueError)
try:
if (c != ‘‘) & (g < int(c)):
d.append(c)
g = c
i = c + 1
print("illlldddd:", i)
e = c + 1
continue

elif (c == ‘‘):
break
except UnboundLocalError:
print(UnboundLocalError)

return task_class
break
print("d", d, type(d))
print(d[0], type(d[0]))
print("d的長度:",len(d))
#開始切片
if len(d) != 0:
for j in range(len(d)):
print("info11:::", info, type(info))
info = ‘‘.join(info)
print("info222:::",info,type(info))
print("d[%d]"%j,d[j])
#print("d[j]:5"%j,info[d[j]:5])

llll = info[d[j]+1:d[j]+5]
print("d[%d]:5" % j, llll)
task_class.append(llll)
print("task_class::11", task_class)

task_class=",".join(task_class)
print("str1112222",task_class)
return task_class


def insettable(file):
print("file::::::::::::::::::::", file)
print("type::::::::::::::::::::", type(file))
# file1 = file
# file1 = str(file1)
ddd = re.findall("知識庫\\\(\d+)", file)
print("ddd:::::::::::", ddd)
print("ddd[0]:::", ddd[0])
ddd = int(ddd[0])
print("ddd::::", type(ddd))

file = docx.Document(file)
# 讀取表格:
t = file.tables[0]
print(t)
print("1:", t.cell(0, 0).text) # 1
cell1 = t.cell(0, 0).text
print("tyep::::", type(t.cell(0, 0).text))

print("2:", t.cell(0, 1).text) # 2
cell2 = t.cell(0, 1).text

print("2:", t.cell(0, 2).text) # 2
cell3 = t.cell(0, 2).text

print("2:", t.cell(0, 3).text) # 2
cell4 = t.cell(0, 3).text
print("cell4:::::::::", cell4)

print("3:", t.cell(1, 0).text) # 3
cell5 = t.cell(1, 0).text

print("4:", t.cell(1, 2).text) # 4
cell6 = t.cell(1, 2).text

print("5:", t.cell(1, 3).text) # 5
task_type = t.cell(1, 3).text
# task_type = re.findall(‘.*[?√](.*)$‘, cell7)
# task_type = ‘‘.join(cell7)
print("task_type111111:", task_type)
# task_class = task_class[0:4]
‘‘‘低級處理方式
a = int(task_type.count("?"))
print("a|||||||", a, type(a))
b = int(task_type.count("√"))
print("b|||||||", b, type(a))
if (a == 1) | (b == 1):
print("111111111111111111")
# task_type = re.findall(‘.*[?√](.*)$‘, task_type)
task_type = re.findall(‘.*[?√](.*)$‘, task_type)
print("task_type1", task_type)
task_type = ‘‘.join(task_type)
print("task_type2", task_type)
task_type = task_type[0:4]
print("task_type3:d:%s,b=%d" % (a, b), task_type)
elif (a == 0) & (b == 0):
print("2222222222222222222")
task_type = ‘法定職責‘
print("a:%s,b=%s" % (a, b), task_type)
elif (a == 2) | (b == 2):
print("333333333333333333333")
task_type = ‘法定職責,工作職責 ‘
print("a:%s,b=%s" % (a, b), task_type)
‘‘‘
#調用切片函數
task_type1 = section(task_type, "√", 4)
task_type2 = section(task_type, "?", 4)
task_type1 = "".join(task_type1)
task_type2 = "".join(task_type2)
print("task_type1:::", task_type1,type(task_type1))
print("task_type2:::", task_type2,type(task_type2))
if task_type1.strip()!="":
task_type = task_type1
print("task_type111:::", task_type1)
elif task_type2.strip()!="":
task_type = task_type2
print("task_type222:::", task_type2)

print("6:", t.cell(1, 4).text) # 6
cell8 = t.cell(1, 4).text

print("7:", t.cell(2, 1).text) # 7
cell9 = t.cell(2, 1).text

# 擷取文檔對象
# file = docx.Document("D:\\配置庫\\公案APP\\1.2 系統規格\\知識庫\\14人員死亡先期處置.docx")
print("段落數:" + str(len(file.paragraphs))) # 段落數為13,每個斷行符號隔離一段
lenn = len(file.paragraphs)
print("len:", lenn)
# 輸出每一段的內容
for para in file.paragraphs:
print(para.text)

# 輸出段落編號及段落內容
for i in range(len(file.paragraphs)):
print("第" + str(i) + "段的內容是:" + file.paragraphs[i].text)

list6 = []
for i in range(len(file.paragraphs)):
if 0 == i:
print("i:", i)
lis0 = file.paragraphs[i].text
print("list0:", lis0)
print(type(lis0))

elif 1 == i:
print("i:", i)
task_class = file.paragraphs[i].text
print("lis1", task_class,type(task_class))
‘‘‘低級處理方式
print("task_class111111:", task_class)

c = int(task_class.count("?"))
task_class = ‘‘.join(task_class)
#print(task_class.index(‘?‘))
print("c|||||||", c, type(c))
d = int(task_class.count("√"))
print(task_class.index(‘√‘))

print("d|||||||", d, type(d))
task_class = re.findall(r‘[?√](?:.*)‘, task_class)
task_class = ‘‘.join(task_class)
task_class = task_class[1:5]
print("task_class", task_class)
‘‘‘
#調用切片函數
task_class1 = section(task_class, "√", 4)
task_class2 = section(task_class, "?", 4)
task_class1 = "".join(task_class1)
task_class2 = "".join(task_class2)
print("task_class1:::", task_class1,type(task_class1))
print("task_class2:::", task_class2,type(task_class2))
if task_class1.strip()!="":
task_class = task_class1
print("task_class11:::", task_class1)
elif task_class2.strip()!="":
task_class = task_class2
print("task_class22:::", task_class2)


if 2 == i:
print("i:", i)
lis2 = file.paragraphs[i].text

print("lis2", lis2)
print(type(lis2))
preparer = re.findall(‘填表單位:(.*?)$‘, lis2)
preparer = ‘‘.join(preparer)
print("preparer:%s" % preparer)

# elif 3 == i:
# print("i:", i)
# lis3 = file.paragraphs[i].text
elif 3 == i:
print("i:", i)
lis4 = file.paragraphs[i].text
print("lis4", lis4)
print(type(lis4))
elif 3 < i < lenn - 1:
print("i:", i)
print(file.paragraphs[i].text)
print(type(file.paragraphs[i].text))
# list6[i-5] = list6.append(file.paragraphs[i].text)
list6.append(str(file.paragraphs[i].text).strip(‘\xa0‘))
# list6.append("%s\n" % str(file.paragraphs[i].text).strip(‘\xa0‘))
print(list6)
key_steps = "\n".join(list6)
# print("key_steps:\n",key_steps.strip(‘\n‘))

cursor.execute(
"insert into `t_knowledge_base` (`no`, `preparer`, `task_class`, `task_name`, `task_specification`, `task_type`, `task_desc`, `task_basis`, `key_steps`) values (‘%d‘,‘%s‘,‘%s‘,‘%s‘,‘%s‘,‘%s‘,‘%s‘,NULL,‘%s‘)" % (
ddd, preparer, task_class, cell2, cell4, task_type, cell9, key_steps))
conn.commit()


def traverse(f):
fs = os.listdir(f)
for f1 in fs:
tmp_path = os.path.join(f, f1)
if not os.path.isdir(tmp_path):
print(‘檔案: %s‘ % tmp_path)
insettable(tmp_path)
else:
print(‘檔案夾:%s‘ % tmp_path)
traverse(tmp_path)


path = ‘D:\\配置庫\公案APP\\1.2 系統規格\\知識庫‘
traverse(path)

#單檔案調測
# path = ‘D:\\配置庫\\公案APP\\1.2 系統規格\\知識庫\\14人員死亡先期處置.docx‘
# insettable(path)

# 關閉遊標
cursor.close()
# 關閉串連
conn.close()

python讀取word文檔,插入mysql資料庫執行個體

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.