A simple Python program that splits a large file into pages by line count

I ran into the following requirement in a project:
There is a record file with one item per line, and the whole file is about 2 GB. The requirement is to deliver 100000 items from it to another system every day. What is a good way to handle this?
After some thought, paging seemed like a good approach: first split the large file into pages of a fixed number of items, then hand out one page per day.
The Python code is as follows:


# -*- coding: utf-8 -*-

import os
import sys
import shutil
import time
# import linecache
import hashlib
import zlib
import binascii
import urllib2

import logging


datas_dir = "./datas/"
items_per_page = 10000
url_prefix = "http://172.16.1.110:80/download/"
check_result_dir = "./results/"

logger = logging.getLogger(__name__)

def initialize():
    """
    @summary: initialize the working directory
    """
    if os.path.exists(datas_dir) and os.path.isdir(datas_dir):
        # clear the old datas directory
        print "begin to remove old datas directory"
        shutil.rmtree(datas_dir)
    print "begin to make datas directory"
    # sleep for 1 second to work around the race between rmtree and mkdir
    time.sleep(1)
    os.mkdir(datas_dir)


def read_specific_lines(file, lines_to_read):
    """
    @summary: read specific lines from file
    file is any iterable; lines_to_read is an iterable
    containing int values
    """
    lines = set(lines_to_read)
    last = max(lines)
    for n, line in enumerate(file):
        if n + 1 in lines:
            yield line
        if n + 1 > last:
            return

def split_file(filename, lines_per_page):
    """
    @summary: split the file into pages of lines_per_page lines each
    """
    if lines_per_page <= 0:
        lines_per_page = 1

    with open(filename, 'r') as fp:
        lines = []
        for n, line in enumerate(fp):
            lines.append(line)
            if (n + 1) % lines_per_page == 0:
                yield lines
                lines = []
        # yield the trailing partial page, if any (a full final page was
        # already yielded inside the loop, so it is not emitted twice)
        if lines:
            yield lines

def write_to_file(lines, filename):
    """
    @summary: write lines to specified file
    """
    with open(filename, 'w') as fp:
        for line in lines:
            # construct content
            line_to_write = url_prefix + line
            fp.write(line_to_write)

def calculate_md5_crc32(msg):
    """
    @summary: calculate the md5 and crc32
    """
    m = hashlib.md5()
    m.update(msg)
    md5 = m.hexdigest().upper()
    crc32 = binascii.crc32(msg)
    crc32 = crc32 & 0xffffffff    # mask to an unsigned 32-bit value
    crc32_str = "%08x" % crc32
    crc32_str = crc32_str.upper()

    return md5 + '.' + crc32_str

def check_file_integrity(download_url):
    """
    @summary: download the file and check its integrity
    @return: True/False
    """
    try:
        file_name = download_url.rsplit("/", 1)[1]
        response = urllib2.urlopen(download_url)
        md5_crc32 = calculate_md5_crc32(response.read())
        print "file_name = %s, md5_crc32 = %s" % (file_name, md5_crc32)
        # the file is expected to be named "<MD5>.<CRC32>" of its own content
        if file_name == md5_crc32:
            return True
        else:
            return False
    except Exception as ex:
        logger.exception(ex)
        return False

def do_check():
    if os.path.exists(check_result_dir) and os.path.isdir(check_result_dir):
        # clear the old results directory
        print "begin to remove old result directory"
        shutil.rmtree(check_result_dir)
    print "begin to make result directory"
    # sleep for 1 second to work around the race between rmtree and mkdir
    time.sleep(1)
    os.mkdir(check_result_dir)
    # fp = open("not_integrity.list", 'w')

    for n, lines in enumerate(split_file("alive_sample.log", items_per_page)):
        print "begin to check %d sample list" % (n + 1)
        if n >= 1:
            break
        filename = os.path.join(check_result_dir, "notintergrity_" + str(n + 1) + ".list")
        fp = open(filename, 'w')
        for line in lines:
            try:
                download_url = url_prefix + line.strip()
                res = check_file_integrity(download_url)
                if res == False:
                    fp.write(line)
                    fp.flush()
                    logger.error("check integrity error, download_url = %s", download_url)
                else:
                    print "%s check OK" % line
            except Exception as ex:
                logger.exception(ex)
        fp.close()

if __name__ == "__main__":
    import myloggingconfig
    #do_check()
    #assert False
    print check_file_integrity("http://172.16.1.110:80/download/B4D2EF861106F6812668D5163EA9CD58.4F38C168")
    assert False
    initialize()
    for n, lines in enumerate(split_file("20120106.rpt", items_per_page)):
        print "begin construct %d sample list" % (n + 1)
        ## if n > 4:
        ##     break
        # construct file name
        filename = os.path.join(datas_dir, "samplelist_" + str(n + 1) + ".list")
        write_to_file(lines, filename)
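
The integrity check above relies on a naming convention: every downloadable file is assumed to be named after its own content as "<MD5>.<CRC32>" (both in upper-case hex), which is exactly the string calculate_md5_crc32 produces. Below is a minimal sketch of the same verification run against a local copy instead of a download; the file name used here is hypothetical.

    import os
    import hashlib
    import binascii

    def check_local_file(path):
        # rebuild the "<MD5>.<CRC32>" string the same way calculate_md5_crc32
        # does, then compare it with the file's own name
        with open(path, 'rb') as fp:
            data = fp.read()
        md5 = hashlib.md5(data).hexdigest().upper()
        crc32 = "%08X" % (binascii.crc32(data) & 0xffffffff)
        return os.path.basename(path) == md5 + '.' + crc32

    # hypothetical sample, named with the same convention as the download URLs above
    print check_local_file("B4D2EF861106F6812668D5163EA9CD58.4F38C168")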

The code above also contains helpers for computing the MD5 and CRC32 checksums; the splitting itself is done entirely by the split_file function.

def split_file(filename, lines_per_page):
    """
    @summary: split the file into pages of lines_per_page lines each
    """
    if lines_per_page <= 0:
        lines_per_page = 1

    with open(filename, 'r') as fp:
        lines = []
        for n, line in enumerate(fp):
            lines.append(line)
            if (n + 1) % lines_per_page == 0:
                yield lines
                lines = []
        # yield the trailing partial page, if any (a full final page was
        # already yielded inside the loop, so it is not emitted twice)
        if lines:
            yield lines
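
A quick usage sketch of split_file follows; the file name small_sample.log and the page size of 3 are made up for illustration. Because split_file is a generator, pages are produced lazily one at a time, so the 2 GB source file never has to be held in memory at once.

    # build a small 7-line demo file, then page through it 3 lines at a time
    with open("small_sample.log", 'w') as fp:
        for i in range(7):
            fp.write("item_%d\n" % i)

    for n, page in enumerate(split_file("small_sample.log", 3)):
        # expected page sizes: 3, 3 and 1
        print "page %d has %d lines" % (n + 1, len(page))
        write_to_file(page, "samplelist_%d.list" % (n + 1))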