-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpredict_gender_and_ethnicity.py
More file actions
89 lines (58 loc) · 2.51 KB
/
predict_gender_and_ethnicity.py
File metadata and controls
89 lines (58 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# coding: utf-8
# # Gender Usernames/Names
# In[1]:
import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf

#from tqdm import tqdm
#tqdm.pandas()

# Seed NumPy's global RNG so any NumPy-driven randomness is repeatable.
np.random.seed(7)

# Short alias for the Keras model loader used by the model-loading cells.
load_model = tf.keras.models.load_model

# `learning_rate` is the supported keyword for the step size; the legacy
# `lr` alias is deprecated in TF 2.x Keras and is no longer accepted by
# newer Keras releases.
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Character set for the gender model's one-hot encoding: 'a'..'z'.
alphabet_list = list(string.ascii_lowercase)

# Number of characters kept per name when it is vectorized.
max_name_len = 20
def string_vectorizer(strng: str, alphabet, max_str_len: int = 20, gender: bool = True) -> np.ndarray:
    """One-hot encode a name into a fixed-height character matrix.

    The input is first stripped of characters outside the allowed set, then
    each of its first ``max_str_len`` characters becomes one row holding a 1
    in every column whose ``alphabet`` entry equals that character (and 0
    elsewhere). Rows of zeros are appended until there are exactly
    ``max_str_len`` rows.

    Args:
        strng: The name or username to encode.
        alphabet: Iterable of characters; one output column per entry.
        max_str_len: Number of rows in the result. Longer inputs are
            truncated, shorter ones are zero-padded.
        gender: When true, the input is lower-cased and everything except
            ``a-z`` is removed. Otherwise case is kept and the
            letters/digits/hyphen filter below is applied.

    Returns:
        A NumPy array with ``max_str_len`` rows and one column per alphabet
        entry (an empty array when ``max_str_len`` is 0).

    Raises:
        ValueError: If ``max_str_len`` is negative. Previously this case
            made the padding loop run forever.
    """
    if max_str_len < 0:
        raise ValueError("max_str_len must be non-negative, got %r" % (max_str_len,))

    if gender:
        cleaned = re.sub(r"[^a-z]+", "", strng.lower())
    else:
        # NOTE(review): the range `A-z` also admits "[", "\", "]", "^", "_"
        # and "`". It looks like a typo for `A-Z`, but it is kept unchanged
        # because the saved model was presumably trained on input filtered
        # this way -- confirm before tightening it.
        cleaned = re.sub(r"[^a-zA-z0-9-]+", "", strng)

    # One row per kept character; a character missing from the alphabet
    # simply yields an all-zero row.
    rows = [
        [1 if char == letter else 0 for char in alphabet]
        for letter in cleaned[:max_str_len]
    ]

    # Pad in a single step instead of rebuilding the list once per row.
    blank_row = [0 for _ in alphabet]
    rows.extend(list(blank_row) for _ in range(max_str_len - len(rows)))
    return np.array(rows)
# Load the saved gender model, then load the weights stored in a separate
# file on top of it.
gendermodel = load_model("gendermodel.h5")
gendermodel.load_weights("genderweights.h5")
gendermodel.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Author's note: 90% accuracy on 30k female and 33k male names synthesised
# from an Indian names list.
gendermodel.summary()
# In[2]:
names = pd.Series(["RajSood25", "mukta57cute", "burnvipulkumarfire", "priyasubramanium"])
print(names)

# Encode every username as a (1, 20, 26) one-hot block and stack the blocks
# along the first axis to form a single batch for the model.
names_transform = np.vstack([
    string_vectorizer(name, alphabet_list, max_name_len).reshape(1, 20, 26)
    for name in names
])

prediction = gendermodel.predict(names_transform)
print("array([[male, female]]) probability")

# Truncate (not round) the two class scores to two decimals for display.
prediction = [
    [int(pred[col] * 100) / 100 for col in (0, 1)]
    for pred in prediction
]
print(np.array(prediction))
# # Ethnicity Usernames
# In[4]:
# Character set for the ethnicity model, read from the "characters" column
# of the CSV file; the first five entries are printed as a quick check.
alphabet_listset = pd.read_csv("ethnicity_listset.csv")["characters"].tolist()
print(alphabet_listset[:5])

# Load the saved ethnicity model, then load the weights stored in a separate
# file on top of it.
ethnicitymodel = load_model("eth_model.h5")
ethnicitymodel.load_weights("eth_weights.h5")
ethnicitymodel.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Author's note: 88% accuracy on Indian vs non-Indian usernames.
ethnicitymodel.summary()
# In[5]:
names = pd.Series(["RajkumarSood", "mukta57", "johnsmith", "KeanuReeves", "Elias", "priyasubramanium", "Devashish"])
print(names)

# Encode every username (case preserved, gender=False filter) as a
# (1, 20, 63) one-hot block and stack the blocks into a single batch.
names_transform = np.vstack([
    string_vectorizer(name, alphabet_listset, max_name_len, False).reshape(1, 20, 63)
    for name in names
])

prediction = ethnicitymodel.predict(names_transform)
print("array([[other, indian ethnicity]]) probability")

# Truncate (not round) the two class scores to two decimals for display.
prediction = [
    [int(pred[col] * 100) / 100 for col in (0, 1)]
    for pred in prediction
]
print(np.array(prediction))
# In[ ]:
# Star this repo to add to your favorite repos or fork the code!