From 6264147bdf041fd9aaca4fae73c7ca00b81507ec Mon Sep 17 00:00:00 2001 From: Anshuman Suri Date: Thu, 1 Mar 2018 03:14:37 +0530 Subject: [PATCH 1/2] Adding a CNN-based classifier for the task of entries/other classification --- classification/.gitignore | 1 + classification/README.md | 12 +++++++++++ classification/model.py | 29 +++++++++++++++++++++++++ classification/readData.py | 35 +++++++++++++++++++++++++++++++ classification/trainClassifier.py | 19 +++++++++++++++++ 5 files changed, 96 insertions(+) create mode 100644 classification/.gitignore create mode 100644 classification/README.md create mode 100644 classification/model.py create mode 100644 classification/readData.py create mode 100644 classification/trainClassifier.py diff --git a/classification/.gitignore b/classification/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/classification/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/classification/README.md b/classification/README.md new file mode 100644 index 0000000..8d3861c --- /dev/null +++ b/classification/README.md @@ -0,0 +1,12 @@ +## Classification of images into entry/other + +### Proposed Technique +* Convert all images to grayscale (black & white). +* Downsize all images to (150, 250) +* Define a simple CNN-classifier and train it on the given data +* Batch-normalization is used to handle the variance in the given data, while automatic class-weights are used to balance the error function (as the class distribution is biased) +* To account for the low amount of data given, a small learning rate is used (to avoid overfitting) + +### Running it +* Run `python trainClassifier.py <image_dir_prefix> <labels_csv>` from the current directory to train an end-to-end model.
+* For example, run `python trainClassifier.py images/freecen/ data/gold/combined_classifications_20180227.csv` diff --git a/classification/model.py b/classification/model.py new file mode 100644 index 0000000..703372c --- /dev/null +++ b/classification/model.py @@ -0,0 +1,29 @@ +import keras +from keras.models import Sequential +from keras.layers import Dense, Dropout, Flatten, Activation +from keras.layers import Conv2D, MaxPooling2D, BatchNormalization + +# Define a simple CNN model +def getSimpleCNN(input_shape, num_classes): + model = Sequential() + model.add(Conv2D(16, kernel_size=(3, 3), input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(Activation('relu')) + model.add(Conv2D(32, (3, 3))) + model.add(BatchNormalization()) + model.add(Activation('relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + model.add(Dropout(0.25)) + model.add(Flatten()) + model.add(Dense(64)) + model.add(BatchNormalization()) + model.add(Activation('relu')) + model.add(Dropout(0.5)) + model.add(Dense(num_classes, activation='softmax')) + + model.compile(loss=keras.losses.categorical_crossentropy, + optimizer=keras.optimizers.Adadelta(lr=0.1), + metrics=['accuracy']) + + return model + diff --git a/classification/readData.py b/classification/readData.py new file mode 100644 index 0000000..1e4c5b2 --- /dev/null +++ b/classification/readData.py @@ -0,0 +1,35 @@ +import numpy as np +from PIL import Image +import os +from tqdm import tqdm +from scipy.misc import imresize +import csv + +# Read label classification file, construct data +def getData(imageDirPrefix, filePath): + X = [] + Y = [] + with open(filePath, 'r') as f: + reader = csv.reader(f) + for line in tqdm(reader): + filePath = line[0] + imgClass = line[1] + # Read image as a black&white image + image = np.asarray(Image.open(os.path.join(imageDirPrefix, filePath)).convert('L')) + # Resize into a smaller image + image = imresize(image, (150, 250)) + X.append(image) + Y.append(imgClass) + X = np.array(X) + X 
= X.reshape(X.shape + (1,)) + # Also store the mapping between class-names and indices + mappingDict = dict([(y,x) for x,y in enumerate(sorted(set(Y)))]) + Y = np.array([ mappingDict[x] for x in Y]) + return X, Y, mappingDict + + +if __name__ == "__main__": + import sys + X, Y, mapping = getData(sys.argv[1], sys.argv[2]) + print X.shape, Y.shape + diff --git a/classification/trainClassifier.py b/classification/trainClassifier.py new file mode 100644 index 0000000..b7c9954 --- /dev/null +++ b/classification/trainClassifier.py @@ -0,0 +1,19 @@ +import readData +import model +import keras + + +if __name__ == "__main__": + import sys + # Load data + X, Y, mapping = readData.getData(sys.argv[1], sys.argv[2]) + num_classes = len(mapping.keys()) + input_shape = X.shape[1:] + # Loada simple CNN for tha classification task + model = model.getSimpleCNN(input_shape, num_classes) + Y = keras.utils.to_categorical(Y, num_classes) + batch_size = 8 + epochs = 20 + # Train our model on the available data + model.fit(X, Y, batch_size=batch_size, epochs=epochs, validation_split=0.2, class_weight='auto') + From 1126fecf4a6ead64ba7e0d77e39e0ba65efee4d7 Mon Sep 17 00:00:00 2001 From: Anshuman Suri Date: Fri, 2 Mar 2018 21:52:48 +0530 Subject: [PATCH 2/2] Fixing mixed indentation/space issue --- classification/model.py | 41 ++++++++++++------------- classification/readData.py | 50 ++++++++++++++++--------------- classification/trainClassifier.py | 26 ++++++++-------- 3 files changed, 60 insertions(+), 57 deletions(-) diff --git a/classification/model.py b/classification/model.py index 703372c..fe1c3a2 100644 --- a/classification/model.py +++ b/classification/model.py @@ -4,26 +4,27 @@ from keras.layers import Conv2D, MaxPooling2D, BatchNormalization # Define a simple CNN model -def getSimpleCNN(input_shape, num_classes): - model = Sequential() - model.add(Conv2D(16, kernel_size=(3, 3), input_shape=input_shape)) - model.add(BatchNormalization()) - model.add(Activation('relu')) -
model.add(Conv2D(32, (3, 3))) - model.add(BatchNormalization()) - model.add(Activation('relu')) - model.add(MaxPooling2D(pool_size=(2, 2))) - model.add(Dropout(0.25)) - model.add(Flatten()) - model.add(Dense(64)) - model.add(BatchNormalization()) - model.add(Activation('relu')) - model.add(Dropout(0.5)) - model.add(Dense(num_classes, activation='softmax')) - model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=keras.optimizers.Adadelta(lr=0.1), - metrics=['accuracy']) - return model +def getSimpleCNN(input_shape, num_classes): + model = Sequential() + model.add(Conv2D(16, kernel_size=(3, 3), input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(Activation('relu')) + model.add(Conv2D(32, (3, 3))) + model.add(BatchNormalization()) + model.add(Activation('relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + model.add(Dropout(0.25)) + model.add(Flatten()) + model.add(Dense(64)) + model.add(BatchNormalization()) + model.add(Activation('relu')) + model.add(Dropout(0.5)) + model.add(Dense(num_classes, activation='softmax')) + + model.compile(loss=keras.losses.categorical_crossentropy, + optimizer=keras.optimizers.Adadelta(lr=0.1), + metrics=['accuracy']) + return model diff --git a/classification/readData.py b/classification/readData.py index 1e4c5b2..73f21ed 100644 --- a/classification/readData.py +++ b/classification/readData.py @@ -1,4 +1,4 @@ -import numpy as np +import numpy as np from PIL import Image import os from tqdm import tqdm @@ -6,30 +6,32 @@ import csv # Read label classification file, construct data + + def getData(imageDirPrefix, filePath): - X = [] - Y = [] - with open(filePath, 'r') as f: - reader = csv.reader(f) - for line in tqdm(reader): - filePath = line[0] - imgClass = line[1] - # Read image as a black&white image - image = np.asarray(Image.open(os.path.join(imageDirPrefix, filePath)).convert('L')) - # Resize into a smaller image - image = imresize(image, (150, 250)) - X.append(image) - Y.append(imgClass) - X = 
np.array(X) - X = X.reshape(X.shape + (1,)) - # Also store the mapping between class-names and indices - mappingDict = dict([(y,x) for x,y in enumerate(sorted(set(Y)))]) - Y = np.array([ mappingDict[x] for x in Y]) - return X, Y, mappingDict + X = [] + Y = [] + with open(filePath, 'r') as f: + reader = csv.reader(f) + for line in tqdm(reader): + filePath = line[0] + imgClass = line[1] + # Read image as a black&white image + image = np.asarray( + Image.open(os.path.join(imageDirPrefix, filePath)).convert('L')) + # Resize into a smaller image + image = imresize(image, (150, 250)) + X.append(image) + Y.append(imgClass) + X = np.array(X) + X = X.reshape(X.shape + (1,)) + # Also store the mapping between class-names and indices + mappingDict = dict([(y, x) for x, y in enumerate(sorted(set(Y)))]) + Y = np.array([mappingDict[x] for x in Y]) + return X, Y, mappingDict if __name__ == "__main__": - import sys - X, Y, mapping = getData(sys.argv[1], sys.argv[2]) - print X.shape, Y.shape - + import sys + X, Y, mapping = getData(sys.argv[1], sys.argv[2]) + print X.shape, Y.shape diff --git a/classification/trainClassifier.py b/classification/trainClassifier.py index b7c9954..2e0c31b 100644 --- a/classification/trainClassifier.py +++ b/classification/trainClassifier.py @@ -4,16 +4,16 @@ if __name__ == "__main__": - import sys - # Load data - X, Y, mapping = readData.getData(sys.argv[1], sys.argv[2]) - num_classes = len(mapping.keys()) - input_shape = X.shape[1:] - # Loada simple CNN for tha classification task - model = model.getSimpleCNN(input_shape, num_classes) - Y = keras.utils.to_categorical(Y, num_classes) - batch_size = 8 - epochs = 20 - # Train our model on the available data - model.fit(X, Y, batch_size=batch_size, epochs=epochs, validation_split=0.2, class_weight='auto') - + import sys + # Load data + X, Y, mapping = readData.getData(sys.argv[1], sys.argv[2]) + num_classes = len(mapping.keys()) + input_shape = X.shape[1:] + # Loada simple CNN for tha classification task 
+ model = model.getSimpleCNN(input_shape, num_classes) + Y = keras.utils.to_categorical(Y, num_classes) + batch_size = 8 + epochs = 20 + # Train our model on the available data + model.fit(X, Y, batch_size=batch_size, epochs=epochs, + validation_split=0.2, class_weight='auto')