The project ran into the following requirement: there is a record file with one record per line, about 2 GB in total, and we need to deliver 100,000 records to other systems every day. How can this be done?
After some consideration, I settled on the idea of sharding: first partition the large file into pieces of a fixed number of records, then deliver one piece each day.
The Python code is as follows:
# -*- coding: utf-8 -*-
import binascii
import hashlib
import logging
import os
import shutil
import sys
import time
import urllib.request
import zlib
# import linecache
# Directory that receives the generated per-day sample-list files.
datas_dir = "./datas/"
# Number of records placed in each page (one page is delivered per day).
items_per_page = 10000
# Base URL prepended to every record to form its download address.
url_prefix = "http://172.16.1.110:80/download/"
# Directory that receives the integrity-check result files.
check_result_dir = "./results/"
logger = logging.getLogger(__name__)
def initialize():
    """
    @summary: Initialize the working directory.
    Removes any previous datas directory and creates a fresh, empty one.
    """
    if os.path.exists(datas_dir) and os.path.isdir(datas_dir):
        # Clear datas dir
        print("Begin to remove old datas directory")
        shutil.rmtree(datas_dir)
    print("Begin to make datas directory")
    # To resolve the conflict between rmtree and mkdir, sleep 1 second.
    # NOTE(review): shutil.rmtree is synchronous, so this sleep is probably
    # unnecessary — kept to preserve the original behavior.
    time.sleep(1)
    os.mkdir(datas_dir)
def read_specific_lines(file, lines_to_read):
    """
    @summary: Read specific lines from file.
    file is any iterable of lines; lines_to_read is an iterable
    containing 1-based int line numbers.
    Yields the matching lines in file order and stops iterating the
    source as soon as the highest requested line has been passed.
    """
    lines = set(lines_to_read)
    # Guard the empty request: max() on an empty set raises ValueError.
    if not lines:
        return
    last = max(lines)
    for n, line in enumerate(file):
        if n + 1 in lines:
            yield line
        if n + 1 > last:
            return
def split_file(filename, lines_per_page):
    """
    @summary: Split the file into pages of at most lines_per_page lines.
    Yields each page as a list of lines (trailing newlines preserved).
    A non-positive lines_per_page is treated as 1.
    The final, possibly partial, page is yielded only when it is
    non-empty — the original trailing `yield lines` duplicated the last
    page whenever the file length was an exact multiple of
    lines_per_page, and yielded [] for an empty file.
    """
    if lines_per_page <= 0:
        lines_per_page = 1
    with open(filename, 'r') as fp:
        page = []
        for line in fp:
            page.append(line)
            if len(page) == lines_per_page:
                yield page
                page = []
        if page:
            yield page
def write_to_file(lines, filename, prefix=None):
    """
    @summary: Write lines to the specified file, prefixing each line with
    a download-URL prefix.
    @param lines: iterable of record lines (trailing newlines preserved)
    @param filename: path of the output file
    @param prefix: URL prefix prepended to each line; defaults to the
    module-level url_prefix (backward-compatible generalization).
    """
    if prefix is None:
        prefix = url_prefix
    with open(filename, 'w') as fp:
        for line in lines:
            # Construct content: download URL for each record
            fp.write(prefix + line)
def calculate_md5_crc32(msg):
    """
    @summary: Calculate the MD5 and CRC32 fingerprint of msg (bytes).
    @return: "<MD5HEX>.<CRC32HEX>", both parts uppercase; CRC32 is
    rendered as 8 hex digits after masking to an unsigned 32-bit value
    (binascii.crc32 could return negatives on Python 2).
    """
    m = hashlib.md5()
    m.update(msg)
    md5 = m.hexdigest().upper()
    crc32 = binascii.crc32(msg) & 0xffffffff
    crc32_str = ("%08x" % crc32).upper()
    return md5 + '.' + crc32_str
def check_file_integrity(download_url):
    """
    @summary: Download a file and check its integrity.
    The last path component of download_url is expected to be the
    "<MD5>.<CRC32>" fingerprint of the file content (see
    calculate_md5_crc32).
    @return: True when the recomputed fingerprint matches the file name,
    False otherwise — including on any download error, which is logged.
    """
    try:
        file_name = download_url.rsplit("/", 1)[1]
        response = urllib.request.urlopen(download_url)
        md5_crc32 = calculate_md5_crc32(response.read())
        print("file_name = %s, md5_crc32 = %s" % (file_name, md5_crc32))
        return file_name == md5_crc32
    except Exception as ex:
        logger.exception(ex)
        return False
def do_check():
    """
    @summary: Check the integrity of the samples in the first page of
    alive_sample.log; lines that fail the check are written to a result
    file under check_result_dir.
    """
    if os.path.exists(check_result_dir) and os.path.isdir(check_result_dir):
        # Clear the old results
        print("Begin to remove old result directory")
        shutil.rmtree(check_result_dir)
    print("Begin to make result directory")
    # Sleep to let rmtree settle before mkdir (kept from the original).
    time.sleep(1)
    os.mkdir(check_result_dir)
    for n, lines in enumerate(split_file("alive_sample.log", items_per_page)):
        print("Begin to check %d sample list" % (n + 1))
        # Only the first page is checked; later pages are skipped.
        if n >= 1:
            break
        filename = os.path.join(check_result_dir,
                                "notintergrity_" + str(n + 1) + ".list")
        # 'with' replaces the original's double fp.close() (it was called
        # both inside and after the loop).
        with open(filename, 'w') as fp:
            for line in lines:
                try:
                    download_url = url_prefix + line.strip()
                    if not check_file_integrity(download_url):
                        fp.write(line)
                        fp.flush()
                        logger.error("Check integrity error, download_url = %s",
                                     download_url)
                    else:
                        print("%s check OK" % line)
                except Exception as ex:
                    logger.exception(ex)
if __name__ == "__main__":
    import myloggingconfig
    # do_check()
    # Debug probe: verify one known sample before the real run.
    print(check_file_integrity(
        "http://172.16.1.110:80/download/"
        "b4d2ef861106f6812668d5163ea9cd58.4f38c168"))
    # NOTE(review): debugging leftover kept from the original — everything
    # below is unreachable until this assert is removed.
    assert False
    initialize()
    for n, lines in enumerate(split_file("20120106.rpt", items_per_page)):
        print("Begin construct %d sample list" % (n + 1))
        # Construct the output file name for this page.
        filename = os.path.join(datas_dir,
                                "samplelist_" + str(n + 1) + ".list")
        write_to_file(lines, filename)
The code above also contains the MD5 and CRC32 computation helpers. The sharding itself is implemented entirely in the split_file function:
def split_file(filename, lines_per_page):
    """
    @summary: Split the file into pages of at most lines_per_page lines.
    Yields each page as a list of lines (trailing newlines preserved).
    A non-positive lines_per_page is treated as 1.
    The final, possibly partial, page is yielded only when it is
    non-empty — the original trailing `yield lines` duplicated the last
    page whenever the file length was an exact multiple of
    lines_per_page, and yielded [] for an empty file.
    """
    if lines_per_page <= 0:
        lines_per_page = 1
    with open(filename, 'r') as fp:
        page = []
        for line in fp:
            page.append(line)
            if len(page) == lines_per_page:
                yield page
                page = []
        if page:
            yield page