A simple Python program that fragments large files based on the number of lines

Source: Internet
Author: User

The project encountered such a requirement:
There is a record file with one item per line. The size of the entire file is about 2 GB. According to requirements, we need to provide 100000 items to other systems every day. How can this problem be solved?
After consideration, I think the idea of sharding is good. First, we need to partition this large file based on the number of items of each piece, and then we can place one piece each day.
The Python code is as follows:

View code

# -*- coding: utf-8 -*-

import binascii
import hashlib
# import linecache
import logging
import os
import shutil
import sys
import time
import urllib.request  # Python 3 replacement for the original urllib2
import zlib


# Working directory for the generated sample-list pages.
datas_dir = "./Datas/"
# Number of record lines per generated page/file.
items_per_page = 10000
# Prefix prepended to every record to form its download URL.
url_prefix = "http://172.16.1.110:80/download/"
# Directory that receives the integrity-check result files.
check_result_dir = "./Results/"

logger = logging.getLogger(__name__)

def initialize():
    """Reset the working directory.

    Removes any existing data directory, then recreates an empty one.
    """
    if os.path.exists(datas_dir) and os.path.isdir(datas_dir):
        # Clear datas dir
        print("Begin to remove old datas directory")
        shutil.rmtree(datas_dir)
        # Original author slept to avoid a race between rmtree and the
        # following mkdir; only needed when something was actually removed.
        time.sleep(1)
    print("Begin to make datas directory")
    os.mkdir(datas_dir)


def read_specific_lines(file, lines_to_read):
    """Yield the lines of *file* whose 1-based numbers are requested.

    *file* is any iterable of lines; *lines_to_read* is an iterable of
    int line numbers. Iteration stops as soon as the last requested
    line has been passed, so the remainder of the file is never read.
    """
    lines = set(lines_to_read)
    if not lines:
        # Nothing requested: avoid max() on an empty set.
        return
    last = max(lines)
    for n, line in enumerate(file, start=1):
        if n in lines:
            yield line
        if n > last:
            return
def split_file(filename, lines_per_page):
    """Split the file into pages of at most *lines_per_page* lines.

    Yields each page as a list of lines (newlines preserved).
    A non-positive page size is clamped to 1. The original version
    always yielded the accumulator once more after the loop, which
    duplicated the last page when the line count was an exact multiple
    of the page size and yielded an empty page for an empty file; the
    trailing yield is now guarded.
    """
    if lines_per_page <= 0:
        lines_per_page = 1

    with open(filename, 'r') as fp:
        lines = []
        for line in fp:
            lines.append(line)
            if len(lines) == lines_per_page:
                yield lines
                lines = []
        if lines:  # partial final page only
            yield lines

def write_to_file(lines, filename):
    """Write *lines* to *filename*, prefixing each with url_prefix.

    Each element of *lines* is expected to already end with a newline
    (they come straight from split_file), so lines are written verbatim
    after the URL prefix is prepended.
    """
    with open(filename, 'w') as fp:
        for line in lines:
            # Construct the full download URL for this record.
            fp.write(url_prefix + line)

def calculate_md5_crc32(msg):
    """Return "<MD5>.<CRC32>" for *msg* (bytes), both in upper-case hex.

    binascii.crc32 may return a signed value on older Pythons, so the
    result is masked with 0xffffffff before formatting to guarantee an
    unsigned 8-digit representation.
    """
    m = hashlib.md5()
    m.update(msg)
    md5 = m.hexdigest().upper()
    crc32 = binascii.crc32(msg) & 0xffffffff
    crc32_str = ("%08x" % crc32).upper()
    return md5 + '.' + crc32_str

def check_file_integrity(download_url):
    """Download a file and verify its integrity.

    The server names each file "<md5>.<crc32>", so integrity is checked
    by re-hashing the downloaded payload and comparing the digest string
    with the file name taken from the URL.

    @return: True when the digests match, False on mismatch or any error.
    """
    try:
        file_name = download_url.rsplit("/", 1)[1]
        response = urllib.request.urlopen(download_url)
        md5_crc32 = calculate_md5_crc32(response.read())
        print("file_name = %s, md5_crc32 = %s" % (file_name, md5_crc32))
        # The comparison itself is the integrity verdict.
        return file_name == md5_crc32
    except Exception as ex:
        logger.exception(ex)
        return False

def do_check():
    """Check the integrity of the first page of sample URLs.

    Recreates check_result_dir, then for page 0 of "Alive_sample.log"
    downloads every entry; entries that fail the integrity check are
    recorded in a per-page result file.
    """
    if os.path.exists(check_result_dir) and os.path.isdir(check_result_dir):
        # Clear the old results dir.
        print("Begin to remove old result directory")
        shutil.rmtree(check_result_dir)
        # Give the filesystem a moment so mkdir does not race with rmtree.
        time.sleep(1)
    print("Begin to make result directory")
    os.mkdir(check_result_dir)

    for n, lines in enumerate(split_file("Alive_sample.log", items_per_page)):
        print("Begin to check %d sample list" % (n + 1))
        if n >= 1:
            # Only the first page is checked.
            break
        filename = os.path.join(
            check_result_dir, "Notintergrity_" + str(n + 1) + ".List")
        # `with` guarantees the result file is closed even when a line
        # raises; the original leaked (and double-closed) the handle.
        with open(filename, 'w') as fp:
            for line in lines:
                try:
                    download_url = url_prefix + line.strip()
                    if not check_file_integrity(download_url):
                        fp.write(line)
                        fp.flush()
                        logger.error(
                            "Check integrity error, download_url = %s",
                            download_url)
                    else:
                        print("%s check OK" % line)
                except Exception as ex:
                    logger.exception(ex)
if __name__ == "__main__":
    import myloggingconfig  # project-local logging configuration

    # do_check()
    # Ad-hoc debug probe left by the author: verify one known URL, then stop
    # before the splitting run below.
    print(check_file_integrity(
        "http://172.16.1.110:80/download/"
        "b4d2ef861106f6812668d5163ea9cd58.4f38c168"))
    # Was `assert False` — asserts disappear under `python -O`, so exit
    # explicitly instead.
    raise SystemExit(0)

    initialize()
    for n, lines in enumerate(split_file("20120106.rpt", items_per_page)):
        print("Begin construct %d sample list" % (n + 1))
        # # if n > 4:
        # #     break
        # Construct the per-page output file name.
        filename = os.path.join(
            datas_dir, "Samplelist_" + str(n + 1) + ".List")
        write_to_file(lines, filename)

The above code includes the MD5 and CRC32 computation utilities. The entire sharding logic is contained in the split_file function:

def split_file(filename, lines_per_page):
    """
    @summary: split the file into n lines a page
    """
    if lines_per_page <= 0:
        lines_per_page = 1
    with open(filename, 'r') as fp:
        lines = []
        for n, line in enumerate(fp):
            guard = n % lines_per_page
            if guard == 0:
                lines = []
            lines.append(line)
            if guard == lines_per_page - 1:
                yield lines
        yield lines

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.