http://www.csie.ntu.edu.tw/~r95007/thesis/svdnetflix/report/report.pdf
http://eecs.wsu.edu/~vjakkula/mlproject.pdf
http://michielvanwezel.com/papers/kagie_vdloos_vwezelV2.pdf
http://cseweb.ucsd.edu/users/elkan/KddNetflixWorkshop.pdf
http://www.cs.uic.edu/~liub/KDD-cup-2007/Proceedings/The-Netflix-Prize-Bennett.pdf
Preparing the dataset
1. A shell script that combines all of the training-set files into one file:
#!/bin/bash
# Concatenate every per-movie training file into a single ratings.txt.
#
# The redirection is attached to the whole loop, so ratings.txt is truncated
# once and then receives the output of every `cat`. Redirecting inside the
# loop with `>` would overwrite the file on each iteration and keep only the
# last movie file.
#
# The trailing `&` runs the loop in the background; wait for the job to finish
# before reading ratings.txt.
for x in Netflix/training_set/mv_*.txt; do
    cat "$x"
done > ratings.txt &
http://www.netflixprize.com/community/viewtopic.php?id=87
The Python script below requires the third-party `path` module, which you need to download first.
#!/usr/bin/env python
"""Convert the raw Netflix Prize text files into tab-separated CSV files."""
import csv
import sys

from path import path

# Marker written in place of a missing value: a backslash followed by "N".
# A raw string is used because '\N' is not a valid escape in Python 3.
# NOTE(review): presumably chosen because MySQL's LOAD DATA reads \N as NULL
# -- confirm against the import step.
NULL = r'\N'
class dialect(csv.excel):
    """CSV dialect used for the generated files.

    Fields are separated by a tab and rows end with a bare newline. Quote
    characters are neither doubled nor escaped, and a field is quoted only
    when the writer considers it necessary (``QUOTE_MINIMAL``).
    """
    delimiter = '\t'
    lineterminator = '\n'
    doublequote = False
    escapechar = None
    quoting = csv.QUOTE_MINIMAL
def csvdump(iter_rows_func, basename, dir='.', csvdir='csv', dialect=dialect):
    """Write the rows produced by ``iter_rows_func`` to a CSV file.

    The input location is ``dir/basename`` and the output file is
    ``csvdir/<basename without extension>.csv``. ``csvdir`` is created when
    it does not exist. If the output file already exists, nothing is done.

    Parameters:
        iter_rows_func: callable taking the input path and yielding row tuples.
        basename: name of the input file or directory inside ``dir``.
        dir: directory that holds the raw data.
        csvdir: directory that receives the generated CSV files.
        dialect: csv dialect passed to ``csv.writer``.
    """
    dir, csvdir = path(dir), path(csvdir)
    if not csvdir.exists():
        csvdir.mkdir()
    inpath = dir / basename
    outfile = csvdir / inpath.namebase + '.csv'
    if outfile.exists():
        return
    sys.stderr.write('writing %s...\n' % outfile)
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        out = open(outfile, 'w', newline='')
    else:
        out = open(outfile, 'wb')
    # The context manager makes sure buffered rows are flushed and the handle
    # is closed even if the row generator raises.
    with out:
        write = csv.writer(out, dialect).writerow
        for row in iter_rows_func(inpath):
            write(row)
def itermovierows(path):
    """Yield ``(movie_id, year, title)`` for each line of the movie title file.

    Every line has the form ``id,year,title``. Only the first two commas
    split the line, so commas inside the title are kept. A year of ``NULL``
    is replaced by the module-level ``NULL`` marker; any other year is
    converted to ``int``. Blank lines are skipped.

    NOTE(review): the file is opened with the platform default encoding;
    confirm the encoding of the title file before running on Python 3.
    """
    with open(path) as infile:
        for raw in infile:
            line = raw.rstrip()
            if not line:
                continue
            id, year, title = line.split(',', 2)
            # A conditional expression (rather than ``and``/``or``) keeps a
            # falsy but valid year such as 0 from being turned into NULL.
            year = int(year) if year != 'NULL' else NULL
            yield (int(id), year, title)
def itertrainingsetrows(dir):
    """Yield ``(movie_id, user_id, date, rating)`` for every training rating.

    ``dir`` is a directory of per-movie files. The first line of each file is
    the movie id followed by ``:``; every following line has the form
    ``user_id,rating,date``. Files are visited in sorted name order, and
    empty files and blank lines are skipped.

    ``dir`` may be a plain string or a ``path`` object.
    """
    import os

    for name in sorted(os.listdir(dir)):
        file_path = os.path.join(dir, name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path) as infile:
            iterlines = (line.strip() for line in infile)
            # A default avoids StopIteration escaping the generator when the
            # file is empty.
            header = next(iterlines, None)
            if not header:
                continue
            movie_id = int(header.rstrip(':'))
            for line in iterlines:
                if not line:
                    continue
                user_id, rating, date = line.split(',', 2)
                yield (movie_id, int(user_id), date, float(rating))
def iterprobesetrows(path):
    """Yield ``(movie_id, user_id)`` for every entry of the probe file.

    The file is a sequence of sections. A line ending in ``:`` starts a
    section and carries the movie id; every other non-blank line is a user id
    belonging to the most recent movie id.

    Raises:
        ValueError: if a user id appears before any movie header, or a line
            is not a valid integer.
    """
    movie_id = None
    with open(path) as infile:
        for raw in infile:
            line = raw.strip()
            if not line:
                continue
            if line.endswith(':'):
                movie_id = int(line[:-1])
                continue
            if movie_id is None:
                raise ValueError('user id %r appears before any movie header' % line)
            yield (movie_id, int(line))
def iterqualifyingsetrows(path):
    """Yield ``(movie_id, user_id, date)`` for every entry of the qualifying file.

    A line ending in ``:`` carries the movie id for the lines that follow;
    every other non-blank line has the form ``user_id,date``. ``user_id`` and
    ``date`` are yielded as strings, exactly as read.

    Raises:
        ValueError: if an entry appears before any movie header, or a line
            does not have exactly two comma-separated fields.
    """
    movie_id = None
    with open(path) as infile:
        for raw in infile:
            line = raw.strip()
            if not line:
                continue
            if line.endswith(':'):
                movie_id = int(line[:-1])
                continue
            if movie_id is None:
                raise ValueError('entry %r appears before any movie header' % line)
            user_id, date = line.split(',')
            yield (movie_id, user_id, date)
if __name__ == '__main__':
    # Usage: script [data_dir [csv_dir]]
    # Both arguments are optional; when omitted, csvdump's own defaults apply.
    kwds = {}
    if len(sys.argv) > 1:
        kwds['dir'] = sys.argv[1]
    if len(sys.argv) > 2:
        kwds['csvdir'] = sys.argv[2]
    # Each row generator is paired with the file or directory it reads,
    # looked up inside the data directory.
    for iterfunc, basename in [
            (itermovierows, 'movie_titles.txt'),
            (itertrainingsetrows, 'training_set'),
            (iterprobesetrows, 'probe.txt'),
            (iterqualifyingsetrows, 'qualifying.txt')]:
        csvdump(iterfunc, basename, **kwds)
Perl script
#!/usr/bin/perl
# Flatten the per-movie training files into a single CSV stream on STDOUT.
#
# The first line of each input file is "<movie id>:"; every following line is
# a comma-separated record. Each output row is the movie id followed by the
# record's fields, every value wrapped in double quotes and followed by a
# comma (so each row ends with a trailing comma).
use strict;
use warnings;

my $dir = '/path/to/your/training_set';

opendir(my $dh, $dir) or die("could not open $dir: $!");
# defined() keeps an entry literally named "0" from ending the loop early.
while (defined(my $entry = readdir $dh)) {
    my $fname = "$dir/$entry";
    next unless -f $fname;    # skip '.', '..' and anything that is not a plain file
    open(my $fh, '<', $fname) or die("could not open $fname: $!");
    my $mid = <$fh>;
    if (!defined $mid) {      # empty file: nothing to emit
        close $fh;
        next;
    }
    $mid =~ s/:.*//s;         # keep only the id in front of the colon
    while (my $line = <$fh>) {
        chomp $line;
        print join('', map { qq("$_",) } $mid, split(/,/, $line)), "\n";
    }
    close $fh;
}
closedir $dh;
exit;
$ time ./bigcsv.pl > bigcsv.csv
real 35m11.521s
user 10m36.272s
sys 4m9.940s
mysql> load data infile 'bigcsv.csv' into table main fields terminated by ',' enclosed by '"' lines terminated by '\n';
Query OK, 100480507 rows affected (5 min 34.39 sec)
Records: 100480507  Deleted: 0  Skipped: 0  Warnings: 0