This article shows how to process MySQL data in Python 3 using multiple processes together with asyncio coroutines. The main logic is: pull rows from MySQL, match keywords in them with FlashText, and store the matches back into MySQL. The code is as follows (async_mysql.py):
"""Process MySQL rows with multiple processes and asyncio coroutines.

Rows are pulled from a source table in id ranges, scanned for keywords with
FlashText, and every row with at least one match is written to a destination
table.
"""
import asyncio
import random
import time
from concurrent.futures import ProcessPoolExecutor as Pool

import aiomysql
import click
from flashtext import KeywordProcessor


class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    Reading an attribute whose key is absent returns ``None`` instead of
    raising.
    """

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails; a missing key
        # yields None rather than AttributeError/KeyError.
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value


class AttrDictCursor(aiomysql.DictCursor):
    """aiomysql dictionary cursor that returns each row as an ``AttrDict``."""

    dict_type = AttrDict


class MultiProcessMysql(object):
    """Scan a MySQL table for keywords using processes and coroutines.

    The id range ``[start, end)`` is split across ``workers`` processes; each
    process splits its share across ``pool`` coroutines, and each coroutine
    reads ``step`` ids per query.
    """

    def __init__(self, workers=2, pool=10, start=0, end=2000):
        """Store the configuration and build the keyword matcher.

        The connection settings, table names and keyword list below are
        deployment-specific and must be adapted before running.

        Args:
            workers: Number of worker processes.
            pool: Number of coroutines per process; also used as the
                maximum size of each process's connection pool.
            start: First id to process (inclusive).
            end: Last id to process (exclusive).
        """
        # NOTE(review): the last octet of the host is missing in the
        # article this script was recovered from -- fill in the real host.
        self.host = "192.168.0."
        self.port = 3306
        self.user = "root"
        self.password = "root"
        self.db = "mydb"
        self.origin_table = "judgment_main_etl"  # source table
        self.dest_table = "laws_finance1"
        # Table names are interpolated from the attributes above; the id
        # bounds are passed as query parameters.
        self.s_sql = (
            "select uuid, court_idea, judge_result, reason, plt_claim, "
            f"dft_rep, crs_exm from {self.origin_table} "
            "where %s<=id and id<%s;"
        )
        self.i_sql = (
            f"insert into {self.dest_table} (uuid, title, reason, keyword) "
            "values (%s, %s, %s, %s)"
        )
        self.pool = pool  # coroutines and MySQL connections per process
        self.aionum = self.pool
        self.step = 2000  # number of ids covered by one SELECT
        self.workers = workers  # number of processes
        self.start = start  # first id (inclusive)
        self.end = end  # last id (exclusive)
        # NOTE(review): this list looks like a machine translation of the
        # original keyword list (some entries were visibly garbled) --
        # restore the real terms before running.
        self.keyword = [
            "Illegal operating payment business",
            "net Money laundering",
            "pool of funds",
            "payment license",
            "Clean count",
            "online payment",
            "online payment",
            "mobile payment",
            "aggregated payment",
            "Capital protected guaranteed",
            "secured transaction",
            "Supply chain finance",
            "Net loan",
            "network borrowing",
            "network investment",
            "false mark",
            "self-fusing",
            "Pool of funds",
            "affiliate transactions",
            "Ponzi scheme",
            "Cyber Finance",
            "online investment",
            "internet private",
            "Internet equity",
            "illegal fundraising",
            "contract fraud",
            "crowdfunding",
            "equity transfer",
            "Internet creditor's rights",
            "capital self-financing",
            "investment scam",
            "money laundering",
            "illegal fundraising",
            "Network MLM",
            "Virtual currency bubbles",
            "Network mutual finance",
            "financial fraud",
            "online Banking",
            "credit card theft",
            "phishing",
            "credit card information theft",
            "online money laundering",
            "money laundering scams",
            "digital signature changes",
            "Pay Order theft",
            "financial scam",
            "Cheat",
            "luring investment",
            "concealing project information",
            "Risk disclosure",
            "exaggerated earnings",
            "fraud insurance",
            "illegal insurance business",
            "embezzlement of client funds",
            "credit reporting theft",
            "financial fraud",
            "destruction of financial management",
        ]
        # FlashText is a text-matching package used here instead of ``re``
        # for matching a large number of keywords.
        self.kp = KeywordProcessor()
        self.kp.add_keywords_from_list(self.keyword)

    async def create_mysql_pool(self, loop):
        """Create and return an aiomysql connection pool bound to ``loop``.

        Each process needs its own pool, so the pool is returned to the
        caller instead of being stored on ``self``.
        """
        return await aiomysql.create_pool(
            loop=loop,
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.db,
            maxsize=self.pool,
            charset="utf8",
            cursorclass=AttrDictCursor,
        )

    def cut_range(self, start, end, times):
        """Split ``[start, end)`` into at most ``times`` contiguous ranges.

        Args:
            start: Inclusive lower bound.
            end: Exclusive upper bound.
            times: Desired number of ranges; must be at least 1.

        Returns:
            A list of ``(range_start, range_end)`` tuples covering the
            interval without gaps or overlap. The last range absorbs any
            remainder. Empty when ``start >= end``.

        Raises:
            ValueError: If ``times`` is smaller than 1.
        """
        if times < 1:
            raise ValueError("times must be at least 1")
        # A width of 0 (interval shorter than ``times``) would never
        # advance ``tmp_end`` and loop forever, so clamp it to 1.
        partition = max(1, (end - start) // times)
        ranges = []
        tmp_end = start
        while tmp_end < end:
            tmp_end += partition
            # What is left is too small for another full range: merge it
            # into the current one.
            if end - tmp_end < partition:
                tmp_end = end
            ranges.append((start, tmp_end))
            start = tmp_end
        return ranges

    async def find_keyword(self, db, start, end):
        """Match keywords in rows with ids in ``[start, end)`` and store hits.

        Args:
            db: aiomysql pool to acquire a connection from.
            start: First id (inclusive).
            end: Last id (exclusive).
        """
        # Sleep a random amount first so the coroutines do not all receive
        # their data at the same moment: some wait while others process.
        await asyncio.sleep(random.random() * self.workers * 2)
        print("coroutine start")
        async with db.acquire() as conn:
            async with conn.cursor() as cur:
                while start < end:
                    tmp_end = min(start + self.step, end)
                    print("Aio start: %s, end: %s" % (start, tmp_end))
                    # start <= id < tmp_end
                    await cur.execute(self.s_sql, (start, tmp_end))
                    rows = await cur.fetchall()
                    matches = []
                    for row in rows:
                        # Ignore empty/NULL columns.
                        present = {k: v for k, v in row.items() if v}
                        if not present:
                            continue
                        # str() keeps non-text column values from breaking
                        # the join.
                        text = " ".join(str(v) for v in present.values())
                        found = self.kp.extract_keywords(text)
                        if not found:
                            continue
                        # De-duplicate while keeping first-seen order so
                        # the stored value is deterministic.
                        keyword = " ".join(dict.fromkeys(found))
                        # NOTE(review): ``title`` is not among the selected
                        # columns, so None is inserted for it.
                        matches.append((
                            present.get("uuid"),
                            present.get("title"),
                            present.get("reason"),
                            keyword,
                        ))
                    if matches:
                        await cur.executemany(self.i_sql, matches)
                        await conn.commit()
                    start = tmp_end

    async def _process_range(self, loop, start, end):
        """Run one process's share: open a pool, fan out, close the pool."""
        db = await self.create_mysql_pool(loop)
        try:
            ranges = self.cut_range(start, end, self.aionum)
            print(ranges)
            await asyncio.gather(
                *(self.find_keyword(db, lo, hi) for lo, hi in ranges)
            )
        finally:
            # Release the connections even when a coroutine failed.
            db.close()
            await db.wait_closed()

    def single_process(self, start, end):
        """Task of a single worker process for ids in ``[start, end)``.

        Creates an event loop for this process, runs the coroutines on it,
        and closes the loop afterwards.
        """
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(self._process_range(loop, start, end))
        finally:
            loop.close()

    def run(self):
        """Run all worker processes and print the elapsed time.

        An exception raised in a worker is re-raised here by
        ``Future.result()``.
        """
        tasks = []
        ranges = self.cut_range(self.start, self.end, self.workers)
        start_time = time.time()
        with Pool(max_workers=self.workers) as executor:
            for start, end in ranges:
                print("Processor start: %s, end: %s" % (start, end))
                tasks.append(executor.submit(self.single_process, start, end))
            for task in tasks:
                task.result()
        print("Total time: %s" % (time.time() - start_time))


@click.command(help="Run")
@click.option("-w", "--workers", default=2, help="Number of processes")
@click.option(
    "-p", "--pool", default=10,
    help="Number of coroutines (MySQL connections) per process",
)
@click.option("-s", "--start", default=0, help="MySQL start id")
@click.option("-e", "--end", default=2640000, help="MySQL end id")
def main(workers, pool, start, end):
    """Command-line entry point: build the processor and run it."""
    mp = MultiProcessMysql(workers=workers, pool=pool, start=start, end=end)
    # Every coroutine holds one connection, so ask before opening a large
    # number of them.
    if workers * pool > 100:
        if not click.confirm(
            "MySQL connection number exceeds 100 (%s), confirm?"
            % (workers * pool)
        ):
            return
    mp.run()


if __name__ == "__main__":
    main()
Run as follows:
$ python3 async_mysql.py -w 2  # other options can be given as well; any omitted option uses its default value
Personal blog
Process MySQL data with Python3 multi-process and co-routines