# GenderEthnicityDetector/predict_gender_and_ethnicity.py at master · devssh/GenderEthnicityDetector · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88

# coding: utf-8

# # Gender Usernames/Names

# In[1]:


import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf

#from tqdm import tqdm
#tqdm.pandas()

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(7)

# Short alias used below to restore the saved Keras models.
load_model = tf.keras.models.load_model
# Use the `learning_rate` keyword: `lr` is a deprecated alias that newer Keras
# releases no longer accept.
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Column alphabet for the gender model (lowercase ASCII letters) and the
# fixed number of character rows every encoded name is padded/truncated to.
alphabet_list = list(string.ascii_lowercase)
max_name_len = 20

def string_vectorizer(strng, alphabet, max_str_len=20, gender=True):
    """One-hot encode a name into a fixed-size character matrix.

    The input is filtered first: with ``gender=True`` it is lower-cased and
    every character other than ``a-z`` is removed; otherwise every character
    outside the class ``[a-zA-z0-9-]`` is removed and case is kept. Each of
    the first ``max_str_len`` remaining characters becomes one row holding a
    1 in every column whose ``alphabet`` entry equals that character (an
    all-zero row if none does). Shorter inputs are padded with all-zero rows.

    Args:
        strng: Name or username to encode.
        alphabet: Characters that define the columns of the matrix.
        max_str_len: Number of rows in the result; must be non-negative.
        gender: Selects which of the two filters described above is applied.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(max_str_len, len(alphabet))``.

    Raises:
        ValueError: If ``max_str_len`` is negative.
    """
    if max_str_len < 0:
        # Padding can never reach a negative row count, so reject it up front
        # instead of looping forever.
        raise ValueError("max_str_len must be non-negative")

    columns = list(alphabet)

    if gender:
        cleaned = re.sub(r"[^a-z]+", "", strng.lower())
    else:
        # NOTE(review): the range "A-z" (rather than "A-Z") also lets the
        # ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `) through. It is
        # kept unchanged because the saved model may have been trained on
        # input filtered exactly this way -- confirm before tightening.
        cleaned = re.sub(r"[^a-zA-z0-9-]+", "", strng)

    rows = [
        [1 if column == letter else 0 for column in columns]
        for letter in cleaned[:max_str_len]
    ]
    # Pad with all-zero rows up to the requested length.
    rows.extend([0] * len(columns) for _ in range(max_str_len - len(rows)))

    # The explicit reshape keeps the result 2-D even when there are no rows.
    return np.array(rows, dtype=int).reshape(max_str_len, len(columns))


# Restore the saved gender classifier, then load the separately stored
# weights into it.
gendermodel = load_model("gendermodel.h5")
gendermodel.load_weights("genderweights.h5")
gendermodel.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Author-reported result: 90% accuracy on 30k female and 33k male names
# synthesised from an Indian names list.
gendermodel.summary()


# In[2]:


names = pd.Series(["RajSood25", "mukta57cute", "burnvipulkumarfire", "priyasubramanium"])
print(names)
# Encode every name and stack the matrices along a new leading axis, giving a
# batch of shape (n_names, max_name_len, len(alphabet_list)). Deriving the
# shape from the encoder avoids repeating those sizes as literals here.
names_transform = np.stack(
    [string_vectorizer(name, alphabet_list, max_name_len) for name in names]
)
prediction = gendermodel.predict(names_transform)
print("array([[male, female]]) probability")
# Truncate (rather than round) each probability to two decimals for display.
prediction = [[int(pred[0]*100)/100, int(pred[1]*100)/100] for pred in prediction]
print(np.array(prediction))


# # Ethnicity Usernames

# In[4]:


# The column alphabet for the ethnicity model is read from the "characters"
# column of a CSV file; the first few entries are printed as a sanity check.
characters_column = pd.read_csv("ethnicity_listset.csv")["characters"]
alphabet_listset = characters_column.tolist()
print(alphabet_listset[:5])

# Restore the saved ethnicity classifier, then load the separately stored
# weights into it.
ethnicitymodel = load_model("eth_model.h5")
ethnicitymodel.load_weights("eth_weights.h5")
ethnicitymodel.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Author-reported result: 88% accuracy on Indian vs non-Indian usernames.
ethnicitymodel.summary()


# In[5]:


names = pd.Series(["RajkumarSood", "mukta57", "johnsmith", "KeanuReeves", "Elias", "priyasubramanium", "Devashish"])
print(names)
# Encode every username with the case-preserving filter (gender=False) and
# stack the matrices along a new leading axis, giving a batch of shape
# (n_names, max_name_len, len(alphabet_listset)). Deriving the shape from the
# encoder avoids repeating those sizes as literals here.
names_transform = np.stack(
    [string_vectorizer(name, alphabet_listset, max_name_len, False) for name in names]
)
prediction = ethnicitymodel.predict(names_transform)
print("array([[other, indian ethnicity]]) probability")
# Truncate (rather than round) each probability to two decimals for display.
prediction = [[int(pred[0]*100)/100, int(pred[1]*100)/100] for pred in prediction]
print(np.array(prediction))


# In[ ]:


# Star this repo to add to your favorite repos or fork the code!