Another quick, casual write-up.
The first step is to derive the label from each file's name. I use a simple cat-and-dog classification dataset here, where the label is encoded directly in the image filename.
def _find_image_files(data_dir, labels_file):
    """Glob all jpg files under data_dir and derive a 0/1 label per file.

    Args:
        data_dir: directory holding images named like 'cat.0.jpg' / 'dog.3.jpg'.
        labels_file: unused here; kept for interface compatibility — this
            dataset encodes the label in the filename itself.

    Returns:
        (filenames, labels): two parallel tuples, shuffled together in
        lockstep. Label is 0 if the basename contains 'cat', else 1.
    """
    jpeg_file_path = '%s/*.jpg' % data_dir
    matching_files = tf.gfile.Glob(jpeg_file_path)
    # Label comes from the filename: 'cat' -> 0, everything else (dog) -> 1.
    labels = [0 if 'cat' in os.path.basename(path) else 1
              for path in matching_files]
    # Pair filenames with labels so one shuffle keeps them aligned.
    paired = list(zip(matching_files, labels))
    shuffle(paired)
    filenames, labels = zip(*paired)
    return filenames, labels
Next, specify the record format to save: a tf.train.Example, TensorFlow's standard serialized record structure. Here I store only the encoded image bytes and the integer label.
def _convert_to_example(image_buffer, label):
    """Wrap one encoded image and its integer label in a tf.train.Example.

    Args:
        image_buffer: raw encoded image bytes (e.g. the JPEG file contents).
        label: integer class label (0 = cat, 1 = dog).

    Returns:
        A tf.train.Example with 'image/label' and 'image/encoded' features.
    """
    example = tf.train.Example(features=tf.train.Features(feature={
        'image/label': _int64_feature(label),
        'image/encoded': _bytes_feature(image_buffer),
    }))
    return example
Next comes writing the data. We could write every image and label into a single file, or split them across several: first partition the (image, label) pairs into chunks, then write each chunk to its own TFRecord file.
def _process_image_files (output_directory, name, filenames, labels, num_shards): Num_images = Len (filenames) # See how many pictures Slice Num_batch = Np.linspace (0,num_images,num_shards+1). Astype (Np.int) # Dividing the data into n blocks, you can look at Np.linspace and know why you add 1 for cou Nter in Range (num_shards): output_filename = '%s-%.5d-of-%.5d '% (name, counter, num_shards) #要保存的tfrecords文件名, tra in-00001-00008 (first name, written several files, a total of several files) Output_file = Os.path.join (output_directory, Output_filename) start,end = Num_batch[counter], num_batch[counter+1] #获得每一段数据的起始范围 writer = tf.python_io. Tfrecordwriter (Output_file) #写文件描述符 for J in Range (Start,end): #遍历这段区间的所有文件 Filename,label = filenames[ J],LABELS[J] Try:with tf.gfile.FastGFile (filename, ' RB ') as F:image_buffe R = F.read () #读取图片的原始数据 except Exception as E:print (e) Continue E Xample = _convert_to_example (image_buffer, label) #将数据和标签保存成The specified example format writer.write (example. Serializetostring ()) print (' Writing {} picture, the filename is {}, the label is {}, the Shard is {} '. Format (J,filename,la Bel,counter)) Writer.close ()
Putting the pieces together:
def _process_dataset(output_directory, name, directory, labels_file, num_shards):
    """Find the images under directory, then write them as sharded TFRecords.

    Args:
        output_directory: where the TFRecord shards are written.
        name: shard filename prefix (e.g. 'train').
        directory: directory containing the source jpg images.
        labels_file: forwarded to _find_image_files (unused for this dataset).
        num_shards: number of TFRecord files to produce.
    """
    filenames, labels = _find_image_files(directory, labels_file)
    _process_image_files(output_directory, name, filenames, labels, num_shards)
def main(unused_argv):
    """Entry point: convert the cat/dog training images into 8 TFRecord shards."""
    # The cat/dog data has no separate label file, so pass an empty string;
    # if your dataset does have one, handle it in _find_image_files.
    _process_dataset('tfdata', 'train', '../../cat_dog/train', '', 8)


if __name__ == '__main__':
    tf.app.run()
Sample output from a run:
Writing 24993 picture, the filename is ..\..\cat_dog\train\dog.10861.jpg, the label is 1, the shard is 7
Writing 24994 picture, the filename is ..\..\cat_dog\train\dog.7031.jpg, the label is 1, the shard is 7
Writing 24995 picture, the filename is ..\..\cat_dog\train\cat.7885.jpg, the label is 0, the shard is 7
Writing 24996 picture, the filename is ..\..\cat_dog\train\dog.8770.jpg, the label is 1, the shard is 7
Writing 24997 picture, the filename is ..\..\cat_dog\train\dog.6193.jpg, the label is 1, the shard is 7
Writing 24998 picture, the filename is ..\..\cat_dog\train\cat.11390.jpg, the label is 0, the shard is 7
Writing 24999 picture, the filename is ..\..\cat_dog\train\cat.5946.jpg, the label is 0, the shard is 7
Once the files are written we can read them back. First, load the written TFRecord files:
DataSet = Tf.data.TFRecordDataset (filenames)
With this dataset we can do whatever we like: shuffle the data with dataset.shuffle(1024), loop over it for several training epochs with dataset.repeat(), and so on. Before any of that, though, we must parse each serialized Example back into an image, using dataset.map(parse_function):
def _parse_function(example_proto):
    """Parse one serialized Example into (decoded, resized image, int32 label).

    Args:
        example_proto: a scalar string tensor holding a serialized
            tf.train.Example written by _convert_to_example.

    Returns:
        (encoded, label): a float image tensor resized to 224x224 and an
        int32 scalar label.
    """
    features = {
        'image/label': tf.FixedLenFeature((), tf.int64, default_value=0),
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=""),
    }
    parsed = tf.parse_single_example(example_proto, features)
    label = tf.cast(parsed['image/label'], tf.int32)
    encoded = tf.image.decode_image(parsed['image/encoded'])
    # decode_image leaves the static shape completely unknown; fixing the
    # rank here is required, or resize/batch cannot build the graph.
    encoded.set_shape([None, None, None])
    # Resize to one fixed size — without this, differently-sized images
    # cannot be batched together.
    encoded = tf.image.resize_images(encoded, (224, 224))
    return encoded, label
Here is the complete code:
# -*- coding: utf-8 -*-
"""Write a cat/dog image folder into sharded TFRecord files and read it back.

Labels are derived from filenames ('cat' -> 0, otherwise 1); each record
stores the raw encoded JPEG bytes plus the integer label.
"""
import os
from random import shuffle

import numpy as np
import six
import tensorflow as tf


def _int64_feature(value):
    """Wrap an int (or list of ints) as an int64 Feature."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def _float_feature(value):
    """Wrapper for inserting float features into an Example proto."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def _bytes_feature(value):
    """Wrapper for inserting bytes features into an Example proto."""
    if isinstance(value, six.string_types):
        value = six.binary_type(value, encoding='utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _convert_to_example(image_buffer, label):
    """Serialize one (encoded image, label) pair as a tf.train.Example."""
    example = tf.train.Example(features=tf.train.Features(feature={
        'image/label': _int64_feature(label),
        'image/encoded': _bytes_feature(image_buffer),
    }))
    return example


def _process_image_files(output_directory, name, filenames, labels, num_shards):
    """Split the (filename, label) pairs into num_shards TFRecord files."""
    num_images = len(filenames)
    # Shard cut points, e.g. [0, 100, 200, ...] — hence num_shards + 1 points.
    num_batch = np.linspace(0, num_images, num_shards + 1).astype(int)
    for counter in range(num_shards):
        # e.g. 'train-00001-of-00008'.
        output_filename = '%s-%.5d-of-%.5d' % (name, counter, num_shards)
        output_file = os.path.join(output_directory, output_filename)
        start, end = num_batch[counter], num_batch[counter + 1]
        writer = tf.python_io.TFRecordWriter(output_file)
        for j in range(start, end):
            filename, label = filenames[j], labels[j]
            try:
                with tf.gfile.FastGFile(filename, 'rb') as f:
                    image_buffer = f.read()
            except Exception as e:
                # Skip unreadable images rather than aborting the shard.
                print(e)
                continue
            example = _convert_to_example(image_buffer, label)
            writer.write(example.SerializeToString())
            print('Writing {} picture, the filename is {}, the label is {}, '
                  'the shard is {}'.format(j, filename, label, counter))
        writer.close()


def _find_image_files(data_dir, labels_file):
    """Glob jpg files under data_dir and label them from their filenames."""
    jpeg_file_path = '%s/*.jpg' % data_dir
    matching_files = tf.gfile.Glob(jpeg_file_path)
    labels = [0 if 'cat' in os.path.basename(path) else 1
              for path in matching_files]
    paired = list(zip(matching_files, labels))
    shuffle(paired)  # shuffle files and labels in lockstep
    filenames, labels = zip(*paired)
    return filenames, labels


def _process_dataset(output_directory, name, directory, labels_file, num_shards):
    """Find the images, then write them out as sharded TFRecords."""
    filenames, labels = _find_image_files(directory, labels_file)
    _process_image_files(output_directory, name, filenames, labels, num_shards)


def main(unused_argv):
    # No separate labels file for the cat/dog data, so pass an empty string.
    _process_dataset('tfdata', 'train', '../../cat_dog/train', '', 8)


def input_function(filenames):
    """Build a one-shot (image_batch, label_batch) iterator over TFRecords."""

    def _parse_function(example_proto):
        features = {
            'image/label': tf.FixedLenFeature((), tf.int64, default_value=0),
            'image/encoded': tf.FixedLenFeature((), tf.string, default_value=""),
        }
        parsed = tf.parse_single_example(example_proto, features)
        label = tf.cast(parsed['image/label'], tf.int32)
        encoded = tf.image.decode_image(parsed['image/encoded'])
        # decode_image leaves the static shape unknown; fix the rank so
        # resize/batch can build the graph.
        encoded.set_shape([None, None, None])
        # Resize to one common size — otherwise examples cannot be batched.
        encoded = tf.image.resize_images(encoded, (224, 224))
        return encoded, label

    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(_parse_function)
    dataset = dataset.repeat()
    # NOTE(review): the batch size was lost in the original paste — 32 is a
    # placeholder, confirm against the original code.
    dataset = dataset.batch(32)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()
    return next_element


if __name__ == '__main__':
    tf.app.run()
We can then verify that the data reads back correctly:
# -*- coding: utf-8 -*-
"""Sanity check: read back the TFRecord shards written by build1."""
import tensorflow as tf
from build1 import input_function
import matplotlib.pyplot as plt
import numpy as np

filenames = ['tfdata/train-00000-of-00008', 'tfdata/train-00001-of-00008']
next_element = input_function(filenames)

with tf.Session() as sess:
    # Pull one batch, show the first image and its shape.
    img, lab = sess.run(next_element)
    plt.imshow(img[0].astype(int))  # cast floats back to ints for display
    print(img[0].shape)
    # Drain the rest of the pipeline until it is exhausted.
    while True:
        try:
            print(sess.run(next_element))
        except tf.errors.OutOfRangeError:
            break