Process MySQL data with Python3 multi-process and co-routines

Source: Internet
Author: User

This article describes processing MySQL data with Python 3 using multiple processes plus coroutines. The main logic is: pull rows from MySQL, match keywords with Flashtext, then write the results back to MySQL. The code is as follows (async_mysql.py):

"""Process MySQL data with multiple processes and coroutines.

Each worker process runs its own asyncio event loop with `pool` coroutines;
each coroutine pulls a slice of rows from MySQL, matches keywords with
flashtext, and inserts the matches back into MySQL.
"""
import time
import asyncio
import random
from concurrent.futures import ProcessPoolExecutor as Pool

import aiomysql
from flashtext import KeywordProcessor
import click


class AttrDict(dict):
    """Dictionary whose keys can also be accessed as attributes.

    Missing attributes return None instead of raising AttributeError.
    """

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            return None

    def __setattr__(self, name, value):
        self[name] = value


class AttrDictCursor(aiomysql.DictCursor):
    """aiomysql dictionary cursor that yields AttrDict rows."""

    dict_type = AttrDict


class MultiProcessMysql(object):
    """Process MySQL data with multiple processes and coroutines."""

    def __init__(self, workers=2, pool=10, start=0, end=2000):
        """Connection settings in the first paragraph must be adapted as needed.

        :param workers: number of worker processes
        :param pool: coroutines (and max MySQL connections) per process
        :param start: first MySQL row id to process (inclusive)
        :param end: last MySQL row id boundary (exclusive)
        """
        # NOTE(review): the host was truncated in the original source
        # ("192.168.0. ") — fill in the real address before running.
        self.host = "192.168.0.1"
        self.port = 3306
        self.user = "root"
        self.password = "root"
        self.db = "mydb"
        self.origin_table = "judgment_main_etl"  # source table
        self.dest_table = "laws_finance1"        # destination table
        # Half-open id interval: %s <= id AND id < %s
        self.s_sql = (
            f"select uuid, court_idea, judge_result, reason, plt_claim, "
            f"dft_rep, crs_exm from {self.origin_table} "
            f"where %s<=id and id<%s;"
        )
        self.i_sql = (
            f"insert into {self.dest_table} (uuid, title, reason, keyword) "
            f"values (%s, %s, %s, %s)"
        )
        self.pool = pool            # coroutines and MySQL connections per process
        self.aionum = self.pool
        self.step = 2000            # rows fetched from MySQL per query
        self.workers = workers      # number of processes
        self.start = start          # MySQL start id
        self.end = end              # MySQL end id
        # NOTE(review): the keyword list was machine-translated / garbled in
        # the scraped source; restore the original terms before production use.
        self.keyword = [
            'illegal operating payment business', 'net money laundering',
            'pool of funds', 'payment license', 'clean count',
            'online payment', 'mobile payment', 'aggregated payment',
            'capital protected guaranteed', 'secured transaction',
            'supply chain finance', 'net loan', 'network borrowing',
            'network investment', 'false mark', 'self-fusing',
            'affiliate transactions', 'ponzi scheme', 'cyber finance',
            'online investment', 'internet private', 'internet equity',
            'illegal fundraising', 'contract fraud', 'crowdfunding',
            'equity transfer', "internet creditor's rights",
            'capital self-financing', 'investment scam', 'money laundering',
            'network mlm', 'virtual currency bubbles',
            'network mutual finance', 'financial fraud', 'online banking',
            'credit card theft', 'phishing', 'credit card information theft',
            'online money laundering', 'money laundering scams',
            'digital signature changes', 'pay order theft', 'financial scam',
            'luring investment', 'concealing project information',
            'risk disclosure', 'exaggerated earnings', 'fraud insurance',
            'illegal insurance business', 'embezzlement of client funds',
            'credit reporting theft', 'destruction of financial management',
        ]
        # flashtext is a text-matching package that is much faster than `re`
        # when the number of keywords is large.
        self.kp = KeywordProcessor()
        self.kp.add_keywords_from_list(self.keyword)

    async def create_mysql_pool(self, loop):
        """Create a connection pool.

        Each process needs its own pool, so this is not bound in __init__
        (the object is pickled into the worker processes).
        """
        pool = await aiomysql.create_pool(
            loop=loop,
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.db,
            maxsize=self.pool,
            charset="utf8",
            cursorclass=AttrDictCursor,
        )
        return pool

    def cut_range(self, start, end, times):
        """Split [start, end) into `times` contiguous (start, end) intervals.

        The remainder that does not divide evenly is folded into the last
        interval, so the intervals always cover [start, end) exactly.
        """
        partition = (end - start) // times
        ranges = []
        tmp_end = start
        while tmp_end < end:
            tmp_end += partition
            # Remainder too small for another full partition: extend to end.
            if (end - tmp_end) < partition:
                tmp_end = end
            ranges.append((start, tmp_end))
            start = tmp_end
        return ranges

    async def find_keyword(self, db, start, end):
        """Match keywords in MySQL rows with ids in [start, end).

        Fetches `self.step` rows at a time, extracts keywords from the
        concatenated non-empty column values, and inserts matches into the
        destination table.
        """
        # Sleep a random amount so the coroutines desynchronize: while some
        # wait on MySQL, others are doing CPU work.
        await asyncio.sleep(random.random() * self.workers * 2)
        print("coroutine start")
        async with db.acquire() as conn:
            async with conn.cursor() as cur:
                while start < end:
                    tmp_end = start + self.step
                    if tmp_end > end:
                        tmp_end = end
                    print("aio start: %s, end: %s" % (start, tmp_end))
                    # Half-open interval: %s <= id AND id < %s
                    await cur.execute(self.s_sql, (start, tmp_end))
                    datas = await cur.fetchall()
                    uuids = []
                    for data in datas:
                        if not data:
                            continue
                        # Drop empty columns so join() below never sees None.
                        for key in list(data.keys()):
                            if not data[key]:
                                data.pop(key)
                        keyword = self.kp.extract_keywords("".join(data.values()))
                        if keyword:
                            # De-duplicate the matched keywords.
                            keyword = ",".join(set(keyword))
                            # NOTE(review): `title` is not in the SELECT list,
                            # so data.title is None via AttrDict — confirm the
                            # intended column.
                            uuids.append(
                                (data.uuid, data.title, data.reason, keyword)
                            )
                    await cur.executemany(self.i_sql, uuids)
                    await conn.commit()
                    start = tmp_end

    def single_process(self, start, end):
        """Task run by a single process.

        Creates a per-process connection pool and runs `self.aionum`
        coroutines, each handling one sub-interval of [start, end).
        """
        loop = asyncio.get_event_loop()
        # Create a pool for this process.
        db = loop.run_until_complete(
            asyncio.ensure_future(self.create_mysql_pool(loop))
        )
        ranges = self.cut_range(start, end, self.aionum)
        print(ranges)
        tasks = [self.find_keyword(db, s, e) for s, e in ranges]
        loop.run_until_complete(asyncio.gather(*tasks))

    def run(self):
        """Run the whole job across `self.workers` processes."""
        tasks = []
        ranges = self.cut_range(self.start, self.end, self.workers)
        start_time = time.time()
        with Pool(max_workers=self.workers) as executor:
            for s, e in ranges:
                print("processor start: %s, end: %s" % (s, e))
                tasks.append(executor.submit(self.single_process, s, e))
        # Propagate any worker exception.
        for task in tasks:
            task.result()
        print("total time: %s" % (time.time() - start_time))


@click.command(help="run")
@click.option("-w", "--workers", default=2, help="number of processes")
@click.option("-p", "--pool", default=10, help="number of coroutines per process")
@click.option("-s", "--start", default=0, help="mysql start id")
@click.option("-e", "--end", default=2640000, help="mysql end id")
def main(workers, pool, start, end):
    """CLI entry point: confirm large connection counts, then run."""
    mp = MultiProcessMysql(workers=workers, pool=pool, start=start, end=end)
    if workers * pool > 100:
        if not click.confirm(
            "MySQL connection count exceeds 100 (%s), continue?" % (workers * pool)
        ):
            return
    mp.run()


if __name__ == "__main__":
    main()

Run as follows:
$ python3 async_mysql.py -w 2  # other options may be given; defaults apply otherwise

Personal blog

Process MySQL data with Python3 multi-process and co-routines

Contact Us

The content source of this page is from the Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page is confusing to you, please write us an email, and we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.