
Email Classification

Fetching data from MS SQL Server

!apt install unixodbc-dev
!pip install pyodbc
%%sh
curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
curl https://packages.microsoft.com/config/ubuntu/16.04/prod.list > /etc/apt/sources.list.d/mssql-release.list
sudo apt-get update
sudo ACCEPT_EULA=Y apt-get -q -y install msodbcsql17
import os
import pyodbc
import urllib
import pandas as pd
from sqlalchemy import create_engine
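# pick the last driver reported by pyodbc; after the install above this is
# assumed to be 'ODBC Driver 17 for SQL Server' (msodbcsql17)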
driver = [item for item in pyodbc.drivers()][-1]
conn_string = f'Driver={driver};Server=tcp:server.<domain>.com,<port>;Database=<db>;Uid=<userid>;Pwd=<pass>;Encrypt=yes;TrustServerCertificate=yes;Connection Timeout=30;'
conn = pyodbc.connect(conn_string)
cursor = conn.cursor()
# params = urllib.parse.quote_plus(conn_string)
# conn_str = 'mssql+pyodbc:///?odbc_connect={}'.format(params)
# engine_feat = create_engine(conn_str, echo=True)
# print(engine_feat.table_names())
tname = 'tbl_Final_Lable_Data_18_n_19'
query = f'select count(*) from {tname}'

cursor.execute(query)
cursor.fetchall()
query = f'select top 5 * from {tname}'
df = pd.read_sql(query, conn)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 aglobalcaseid 5 non-null int64
1 Team In 5 non-null object
2 Department 5 non-null object
3 QueryType 5 non-null object
4 SubQueryType 5 non-null object
5 Comment 5 non-null object
6 OriginalQuery 5 non-null object
7 OriginalSubQuery 5 non-null object
8 tSubject 5 non-null object
9 mMsgContent 5 non-null object
10 UmMsgContent 0 non-null object
11 MasterDepartment 5 non-null object
12 nCaseState 0 non-null object
13 nCreatedFromMedia 5 non-null int64
14 Interaction_category 5 non-null object
15 LastTeamName 0 non-null object
16 Rating 5 non-null object
17 AcustId 5 non-null int64
dtypes: int64(3), object(15)
memory usage: 848.0+ bytes
%reload_ext google.colab.data_table
df
df.columns
Index(['aglobalcaseid', 'Team In', 'Department', 'QueryType', 'SubQueryType',
'Comment', 'OriginalQuery', 'OriginalSubQuery', 'tSubject',
'mMsgContent', 'UmMsgContent', 'MasterDepartment', 'nCaseState',
'nCreatedFromMedia', 'Interaction_category', 'LastTeamName', 'Rating',
'AcustId'],
dtype='object')
query = f'select tSubject, mMsgContent, QueryType, SubQueryType from {tname}'
df = pd.read_sql(query, conn)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930913 entries, 0 to 930912
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 tSubject 930913 non-null object
1 mMsgContent 716024 non-null object
2 QueryType 930913 non-null object
3 SubQueryType 930913 non-null object
dtypes: object(4)
memory usage: 28.4+ MB
df.to_pickle('data.p')

Wrangling

# wrangling.py
import os
import numpy as np
import pandas as pd
spath = '/content/email_class'
df = pd.read_pickle(os.path.join(spath,'data','raw','data.p'))
df.columns = ['subj','msg','qtype','stype']
df['type'] = df['qtype'] + ' | ' + df['stype']
df = df.replace(r'^\s*$', np.nan, regex=True)
df = df.dropna(how='all')
df = df.drop_duplicates()
df = df.dropna(subset=['subj', 'msg'], how='all')
df = df.fillna(' ')
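# 'sub_eos_token' marks where the subject ends and the message body begins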
df['subj&msg'] = df['subj'] + ' sub_eos_token ' + df['msg']
df = df[['subj&msg','type']]
df.columns = ['text','target']
df.sample(10000).to_pickle('df_raw_wrangled_sample_10k.p')
# df.sample(10000).to_pickle(os.path.join(spath,'data','wrangled','df_raw_wrangled_sample_10k.p'))
# df.to_pickle(os.path.join(spath,'data','wrangled','df_raw_wrangled_full.p'))
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

tqdm.pandas()
%reload_ext autoreload
%autoreload 2
%reload_ext google.colab.data_table
%config InlineBackend.figure_format = 'retina'

plt.style.use('fivethirtyeight')
plt.style.use('seaborn-notebook')
df = pd.read_pickle(os.path.join(spath,'data','raw','data.p'))
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930913 entries, 0 to 930912
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 tSubject 930913 non-null object
1 mMsgContent 716024 non-null object
2 QueryType 930913 non-null object
3 SubQueryType 930913 non-null object
dtypes: object(4)
memory usage: 28.4+ MB
df.sample(20)
df.columns
df.columns = ['subj','msg','qtype','stype']
df.qtype.nunique()
df.qtype.value_counts()[:50]
df.stype.nunique()
df.stype.value_counts()[:50]
df['type'] = df['qtype'] + ' | ' + df['stype']
df['type'].nunique()
df['type'].value_counts()[:50]
df.subj.nunique()
688740
df.subj.value_counts()[:50]
df.msg.nunique()
665982
df.msg.value_counts()[:50]
df[df.msg.isnull()].sample(10)
df[(df.msg.isnull()) & (df.subj.isnull())].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 subj 0 non-null object
1 msg 0 non-null object
2 qtype 0 non-null object
3 stype 0 non-null object
dtypes: object(4)
memory usage: 0.0+ bytes
df2 = df.replace(r'^\s*$', np.nan, regex=True)
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930913 entries, 0 to 930912
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 subj 907099 non-null object
1 msg 712917 non-null object
2 qtype 930913 non-null object
3 stype 585435 non-null object
4 type 930913 non-null object
dtypes: object(5)
memory usage: 35.5+ MB
df3 = df2.dropna(how='all')
df3.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 930913 entries, 0 to 930912
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 subj 907099 non-null object
1 msg 712917 non-null object
2 qtype 930913 non-null object
3 stype 585435 non-null object
4 type 930913 non-null object
dtypes: object(5)
memory usage: 42.6+ MB
df4 = df3.drop_duplicates()
df4.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 873726 entries, 0 to 930912
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 subj 860490 non-null object
1 msg 684972 non-null object
2 qtype 873726 non-null object
3 stype 557609 non-null object
4 type 873726 non-null object
dtypes: object(5)
memory usage: 40.0+ MB
df4[(df4.msg.isnull()) & (df4.subj.isnull())]
df5 = df4.dropna(subset=['subj', 'msg'], how='all')
df5.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 873578 entries, 0 to 930912
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 subj 860490 non-null object
1 msg 684972 non-null object
2 qtype 873578 non-null object
3 stype 557569 non-null object
4 type 873578 non-null object
dtypes: object(5)
memory usage: 40.0+ MB
df4.shape, df5.shape
((873726, 5), (873578, 5))
sample = df5.sample(10)
sample
!pip install ekphrasis
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
text_processor = TextPreProcessor(
normalize=['url', 'email', 'percent', 'money', 'phone',
'user', 'time', 'date', 'number'],
# annotate={"hashtag", "allcaps", "elongated", "repeated",
# 'emphasis', 'censored'},
fix_html=True,
segmenter="twitter",
corrector="twitter",
unpack_hashtags=True,
unpack_contractions=True,
spell_correct_elong=False,
tokenizer=SocialTokenizer(lowercase=False).tokenize,
dicts=[emoticons]
)
import re
from bs4 import BeautifulSoup
# text_raw = sample.loc[[299500]].msg.tolist()[0]
# text = text_raw
# text = BeautifulSoup(text, "lxml").text
# text = re.sub(r'<.*?>', ' ', text)
# text = re.sub(r'\{[^{}]*\}', ' ', text)
# text = re.sub(r'\s', ' ', text)
# text = re.sub(r'.*\..*ID.*?(?=\s)', ' ', text)
# text = re.sub(r'DIV..*?(?=\s)', ' ', text)
# text = BeautifulSoup(text, "lxml").text
# text = text.strip()
# text = " ".join(text_processor.pre_process_doc(text))
# text
from itertools import groupby
html_residual = 'P . ImprintUniqueID LI . ImprintUniqueID DIV . ImprintUniqueID TABLE . ImprintUniqueIDTable DIV . Section '
caution_residual = 'CAUTION This email originated from outside of the organization . Do not click links or open attachments unless you recognize the sender and know the content is safe . '
def clean_text(text):
    text = ' ' + text + ' '
    # strip HTML markup, leftover tags and CSS-style blocks
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\{[^{}]*\}', ' ', text)
    text = re.sub(r'\s', ' ', text)
    # text = re.sub(r'(?=\s).*\..*ID.*?(?=\s)', ' ', text)
    # text = re.sub(r'(?=\s)DIV..*?(?=\s)', ' ', text)
    text = re.sub(r'Forwarded message.*?(?=____)', ' ', text)
    text = BeautifulSoup(text, "lxml").text
    # ekphrasis normalization (urls, emails, numbers, dates, ...)
    text = ' '.join(text_processor.pre_process_doc(text))
    text = re.sub(r'[^A-Za-z<>. ]', ' ', text)
    text = ' '.join(text.split())
    text = re.sub(html_residual, '', text)
    text = re.sub(caution_residual, '', text)
    # replace alphanumeric identifiers (ticket/order ids) with a <hash> token
    text = re.sub(r'(?:\d+[a-zA-Z]+|[a-zA-Z]+\d+)', '<hash>', text)
    # text = re.sub(r'\b\w{1,2}\b', '', text)
    text = ' '.join(text.split())
    # collapse immediate word repetitions
    text = ' '.join([k for k, v in groupby(text.split())])
    return text
# # text_raw = sample.loc[[75806]].msg.tolist()[0]
# text = text_raw
# text = BeautifulSoup(text, "lxml").text
# text = re.sub(r'<.*?>', ' ', text)
# text = re.sub(r'\{[^{}]*\}', ' ', text)
# text = re.sub(r'\s', ' ', text)
# # text = re.sub(r'.*\..*ID.*?(?=\s)', ' ', text)
# # text = re.sub(r'DIV..*?(?=\s)', ' ', text)
# text = re.sub(r'Forwarded message.*?(?=____)', ' ', text)
# text = BeautifulSoup(text, "lxml").text
# text = " ".join(text_processor.pre_process_doc(text))
# text = re.sub(r'[^A-Za-z0-9<>. ]', ' ', text)
# text = ' '.join(text.split())
# text
sample = df5.sample(1000)
sample['subj_clean'] = sample['subj'].fillna(' ').apply(clean_text)
[(x,y) for x,y in zip(sample.subj.tolist()[:50],sample.subj_clean.tolist()[:50])]
sample['msg_clean'] = sample['msg'].fillna(' ').apply(clean_text)
sample.msg.tolist()
sample.msg_clean.tolist()
sample = df5.sample(1000, random_state=40)
sample['subj_clean'] = sample['subj'].fillna(' ').apply(clean_text)
sample['msg_clean'] = sample['msg'].fillna(' ').apply(clean_text)
sample['subj&msg'] = sample['subj_clean'] + ' | ' + sample['msg_clean']
sample = sample[['subj&msg','type']]
sample.columns = ['text','target']
sample.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 36966 to 574457
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 1000 non-null object
1 target 1000 non-null object
dtypes: object(2)
memory usage: 23.4+ KB
sample
sample = df5.copy()
sample['subj_clean'] = sample['subj'].fillna(' ').apply(clean_text)
sample['msg_clean'] = sample['msg'].fillna(' ').apply(clean_text)
sample['subj&msg'] = sample['subj_clean'] + ' | ' + sample['msg_clean']
sample = sample[['subj&msg','type']]
sample.columns = ['text','target']
sample.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 873578 entries, 0 to 930912
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 873578 non-null object
1 target 873578 non-null object
dtypes: object(2)
memory usage: 20.0+ MB
sample.nunique()
text      803255
target 500
dtype: int64
sample2 = sample.drop_duplicates()
sample2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 820778 entries, 0 to 930912
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 820778 non-null object
1 target 820778 non-null object
dtypes: object(2)
memory usage: 18.8+ MB
sample3 = sample2.replace(r'^\s*$', np.nan, regex=True)
sample3.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 820778 entries, 0 to 930912
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 820778 non-null object
1 target 820778 non-null object
dtypes: object(2)
memory usage: 18.8+ MB
df5.to_pickle('df_raw_wrangled.p')
sample2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 820778 entries, 0 to 930912
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 820778 non-null object
1 target 820778 non-null object
dtypes: object(2)
memory usage: 18.8+ MB
sample2.nunique()
text      803255
target 500
dtype: int64
sample2.describe()

Text Cleaning

import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words('english')))

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

from nltk.stem import PorterStemmer
ps = PorterStemmer()

import warnings
warnings.filterwarnings("ignore")

tqdm.pandas()
%reload_ext autoreload
%autoreload 2
%reload_ext google.colab.data_table
%config InlineBackend.figure_format = 'retina'

plt.style.use('fivethirtyeight')
plt.style.use('seaborn-notebook')
df = pd.read_pickle(os.path.join(path,'data','wrangled','df_raw_wrangled_sample_10k.p'))
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 930773 to 629839
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 10000 non-null object
1 target 10000 non-null object
dtypes: object(2)
memory usage: 234.4+ KB
df.sample(5)
df[df.target=='<redacted>'].sample(5)
df.target.value_counts()
# # cleaning pipeline
# - lowercase
# - remove non-alpha characters
# - stopword removal
# - lemmatization
# - stemming

# - min occurrence filtering
# - max occurrence filtering
# - n-grams
# - misspelling correction
# - contraction expansion (see the sketch below)

# - encode_plus tokenizer
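Contraction expansion is listed in the checklist above but not implemented elsewhere in this notebook; a minimal sketch using a small hand-made mapping (the mapping and the function name are illustrative, not from the original code):

import re

# illustrative, not exhaustive: tiny contraction mapping
contraction_map = {"can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'ll": " will"}
contraction_regex = re.compile('|'.join(map(re.escape, contraction_map)))

def expand_contractions(text):
    # replace each matched contraction with its expanded form
    return contraction_regex.sub(lambda m: contraction_map[m.group(0)], text)

expand_contractions("can't open the attachment, won't retry")  # 'cannot open the attachment, will not retry'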
df = df.reset_index(drop=True)
# label tokens
caution_label = 'CAUTION: This email originated from outside of the organization. \
Do not click links or open attachments unless you recognize the sender and know the \
content is safe'

confidential_label = '<redacted>'

confidential_label = '<redacted>'

retransmit_label = '<redacted>'

alert_label = '<redacted>'

html_labels = ['P.ImprintUniqueID', 'LI.ImprintUniqueID', 'DIV.ImprintUniqueID',
'TABLE.ImprintUniqueIDTable', 'DIV.Section1']
html_regex = re.compile('|'.join(map(re.escape, html_labels)))

newline_token = '\n'


custom_stopwords = ['best', 'regard', 'direct', 'number', 'phone', 'mobile', 'reply', 'url', 'com']

!pip install clean-text
def clean_l1(text):
    text = re.sub(caution_label, ' cautionlabel ', text)
    text = re.sub(confidential_label, ' confidentiallabel ', text)
    text = html_regex.sub('htmltoken', text)
    text = re.sub(retransmit_label, ' retransmittoken ', text)
    text = re.sub(alert_label, ' alerttoken ', text)
    text = re.sub('sub_eos_token', ' bodytoken ', text)
    text = ' ' + text + ' '
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\{[^{}]*\}', ' ', text)
    # # text = re.sub(r'Forwarded message.*?(?=____)', ' ', text)
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(newline_token, ' newlinetoken ', text)
    text = ' '.join(text.split())
    text = re.sub(r'[^A-Za-z.,?\'@]', ' ', text)

    # text = ' '.join(text.split())
    return text
idx = np.random.randint(0, len(df))
print(idx)
print(df.loc[idx, 'text'])
xx = clean_l1(df.loc[idx, 'text']); xx
# print(xx)
df.loc[idx, 'text']
# idx = 9
# print(df.text.iloc[[idx]].tolist()[0])
# xx = df.text.iloc[[idx]].apply(clean_l1).tolist()[0]
# xx
df['text_clean_l1'] = df.text.apply(clean_l1)
df.text_clean_l1.sample().tolist()[0]
set1_words = ['best regards', 'regards', 'thanks regards', 'warm regards']
set1_regex = re.compile('|'.join(map(re.escape, set1_words)))
from itertools import groupby
def replace_words(s, words):
    for k, v in words.items():
        s = s.replace(k, v)
    return s

word_mapping = {' f o ':' fno ',
' a c ':' account ',
' a/c ':' account ',
' fw ':' forward ',
' fwd ':' forward ',
' forwarded ':' forward ',
' no. ':' number ',
}
def clean_l2(text):
    text = ' ' + text + ' '
    text = text.lower()
    text = ' '.join(text.split())
    text = replace_words(text, word_mapping)
    text = re.sub('[.]', ' . ', text)
    text = re.sub('[,]', ' , ', text)
    text = re.sub('[?]', ' ? ', text)
    text = ' '.join([w for w in text.split() if re.match('^[a-z.,?\'\-\~#`!&*()]+$', w)])
    text = re.sub(r'[^a-z.,?\']', ' ', text)
    # mark signature phrases ('regards', ...) and keep only the text before the first one
    text = set1_regex.sub('eostoken', text)
    text = text + ' eostoken '
    text = re.match(r'^.*?eostoken', text).group(0)
    text = re.sub(r'eostoken', '', text)
    text = re.sub(r'\b\w{1,1}\b', '', text)
    text = ' '.join([k for k, v in groupby(text.split())])
    text = ' '.join(text.split())
    return text
xxy = 'warm regards regrads hello regards'
xxy = ' '.join([w for w in xxy.split() if re.match('^[a-z.,?\'\-\~#`!&*()]+$', w)])
xxy = re.sub(r'[^a-z.,?\']', ' ', xxy)
xxy = set1_regex.sub('eostoken', xxy)
re.match(r'^.*?eostoken', xxy).group(0)
'warm eostoken'
idx = 1
# print(df.text.iloc[[idx]].tolist()[0])
print(df.text_clean_l1.iloc[[idx]].tolist()[0])
df.text_clean_l1.iloc[[idx]].apply(clean_l2).tolist()[0]
df['text_clean_l2'] = df.text_clean_l1.apply(clean_l2)
df.drop_duplicates().replace(r'^\s*$', np.nan, regex=True).info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 930773 to 629839
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 10000 non-null object
1 target 10000 non-null object
2 text_clean_l1 10000 non-null object
3 text_clean_l2 9999 non-null object
dtypes: object(4)
memory usage: 390.6+ KB
xx = df.text_clean_l2.apply(lambda x: len(x.split())).values
sns.distplot(xx[xx<1000])
xx.min(), xx.max()

[plot: distribution of cleaned-text word counts (values below 1000)]

# print(np.argmax(-xx))
print(list(np.argsort(-xx))[:20])
df.iloc[[374]]
df1 = df[(df.text_clean_l2.apply(lambda x: len(x.split()))>3) & (df.text_clean_l2.apply(lambda x: len(x.split()))<200)]
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9012 entries, 930773 to 629839
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 9012 non-null object
1 target 9012 non-null object
2 text_clean_l1 9012 non-null object
3 text_clean_l2 9012 non-null object
dtypes: object(4)
memory usage: 352.0+ KB
def clean_l3(text):
    # keep only lowercase letters and spaces
    text = re.sub(r'[^a-z ]', '', text)
    text = ' '.join([lemmatizer.lemmatize(w, 'v') for w in text.split()])
    text = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
    text = ' '.join([w for w in text.split() if w not in stopwords])
    text = ' '.join([w for w in text.split() if w not in custom_stopwords])
    # seen = set()
    # seen_add = seen.add
    # text = ' '.join([x for x in text.split() if not (x in seen or seen_add(x))])
    text = ' '.join([ps.stem(w) for w in text.split()])
    return text
# def temp(text):
# text = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
# text = ' '.join([lemmatizer.lemmatize(w, 'j') for w in text.split()])
# text = ' '.join([lemmatizer.lemmatize(w, 'V') for w in text.split()])
# text = ' '.join([lemmatizer.lemmatize(w, 'R') for w in text.split()])
# return text

# # temp('communicate communication')

# import spacy
# from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB
# lemmatizer = nlp.vocab.morphology.lemmatizer
# lemmatizer('communicate communication', VERB)

# from nltk.stem import PorterStemmer
# ps = PorterStemmer()
# for w in ['commute', 'communication']:
# rootWord=ps.stem(w)
# print(rootWord)
df['text_clean_l2'] = df.text_clean_l1.apply(clean_l2)
df.text_clean_l2.sample(5, random_state=10).tolist()
# import spacy
# nlp = spacy.load("en_core_web_sm")

# def ners(text):
# doc = nlp(text)
# for token in doc:
# print(token.text)
# x = list(set([ent.text for ent in doc.ents if ent.label_=='ORG']))
# x = list(set([(ent.text,ent.label_) for ent in doc.ents]))
# return x

# df.sample(20).text.apply(ners).tolist()
# df.text.iloc[[14]].apply(ners)
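# note: the vectorizer cells below assume a 'clean' column holding the fully cleaned
# text, e.g. (assumption) df['clean'] = df.text_clean_l2.apply(clean_l3)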
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_df=0.5, min_df=10, ngram_range=(1,3))
vectorizer.fit_transform(df.clean.tolist())
<10000x5456 sparse matrix of type '<class 'numpy.int64'>'
with 270546 stored elements in Compressed Sparse Row format>
idx = 100
print(df.text.iloc[[idx]].tolist()[0])
pd.DataFrame(vectorizer.inverse_transform(vectorizer.transform([df.clean.tolist()[idx]]))).T[0]
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=10, ngram_range=(1,2))
X = vect.fit_transform(df.clean.tolist())

def top_tfidf_feats(row, features, top_n=20):
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats, columns=['features', 'score'])
    return df

def top_feats_in_doc(X, features, row_id, top_n=25):
    row = np.squeeze(X[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

features = vect.get_feature_names()
idx = 14
print(df.text.iloc[[idx]].tolist()[0])
print(top_feats_in_doc(X, features, idx, 10))
df[df.clean.str.contains('vora')]

Transformer model

!pip install -q clean-text[gpl] && cp '/content/drive/My Drive/clean_v2.py' .
from clean_v2 import clean_l1
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

import csv

import warnings
warnings.filterwarnings("ignore")

tqdm.pandas()
%reload_ext autoreload
%autoreload 2
%reload_ext google.colab.data_table
%config InlineBackend.figure_format = 'retina'

plt.style.use('fivethirtyeight')
plt.style.use('seaborn-notebook')
df_raw = pd.read_pickle(os.path.join(path,'data_clean_v2.p'))
df_raw.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 824410 entries, 0 to 930912
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 824410 non-null object
1 target 824410 non-null object
dtypes: object(2)
memory usage: 18.9+ MB
df = df_raw.sample(10000, random_state=42)
tokenlist = ['emailtoken', 'urltoken', 'newlinetoken', 'htmltoken', 'currencytoken', 'token', 'digittoken', 'numbertoken']
def preprocess(text):
    text = text.lower()
    text = ' '.join([w for w in text.split() if w not in tokenlist])
    text = ' '.join(text.split()[:50])
    return text
df['text'] = df.text.apply(preprocess)
df = df[df.text.apply(lambda x: len(x.split()))>3]
df['target'] = df.target.apply(clean_l1).str.lower().str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
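# e.g. (assuming clean_l1 strips the punctuation) a raw label like 'Loan | Loan Status'
# becomes 'loan loan status'; OrderedDict.fromkeys keeps the first occurrence of each
# word, giving 'loan status'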
minority_labels = df.target.value_counts()[df.target.value_counts()<100].index.tolist()
df['target'] = df.target.replace(dict.fromkeys(minority_labels, 'other'))
df = df[df.target!='other']
df.target.value_counts()[:25]
target_map = {'<redacted>': '<redacted>'}
df = df.replace({'target': target_map})
df.target.value_counts()[:25]
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])
df.head()
with open('label.csv', 'w') as f:
    wr = csv.writer(f, delimiter="\n")
    wr.writerow(df.target.unique().tolist())
train, val = train_test_split(df, test_size=0.2, random_state=42)
train.reset_index(drop=True).to_csv('train.csv')
val.reset_index(drop=True).to_csv('val.csv')
!pip install -q fast-bert
from fast_bert.data_cls import BertDataBunch

databunch = BertDataBunch('/content', '/content',
tokenizer='distilbert-base-uncased',
train_file='train.csv',
val_file='val.csv',
label_file='label.csv',
text_col='text',
label_col='target',
batch_size_per_gpu=16,
max_seq_length=100,
multi_gpu=False,
multi_label=False,
model_type='distilbert')
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy
import logging
import torch

logger = logging.getLogger()
device_cuda = 'cuda' if torch.cuda.is_available() else 'cpu'
metrics = [{'name': 'accuracy', 'function': accuracy}]
learner = BertLearner.from_pretrained_model(
databunch,
pretrained_path='distilbert-base-uncased',
metrics=metrics,
device=device_cuda,
logger=logger,
output_dir='/content',
finetuned_wgts_path=None,
warmup_steps=500,
multi_gpu=False,
is_fp16=True,
multi_label=False,
logging_steps=50)
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
learner.lr_find(start_lr=1e-4,optimizer_type='lamb')
Stopping early, the loss has diverged
Learning rate search finished. See the graph with {finder_name}.plot()

[plot: learning-rate finder output]

learner.plot(show_lr=2e-2)

[plot: learning-rate finder loss curve with lr=2e-2 marked]

learner.fit(epochs=1,
            lr=2e-2,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="lamb",
            return_results=True)
(206,
1.1582254237920335,
[{'accuracy': 0.5176184690157959, 'loss': 1.860094638971182}])
learner.validate()
{'accuracy': 0.5820170109356014, 'loss': 1.515815230516287}
learner.save_model()
xx = val.sample(5); xx
predictions = learner.predict_batch(xx.text.tolist())
pd.DataFrame(predictions).T
# from fast_bert.prediction import BertClassificationPredictor
# MODEL_PATH = '/content/model_out'

# predictor = BertClassificationPredictor(
# model_path='/content',
# label_path='/content',
# multi_label=False,
# model_type='xlnet',
# do_lower_case=True)

# single_prediction = predictor.predict("just get me result for this text")
# texts = ["this is the first text", "this is the second text"]
# multiple_predictions = predictor.predict_batch(texts)

train, val = train_test_split(df, test_size=0.2, random_state=42)

train = train.reset_index(drop=True)
train.columns = ['text','labels']

val = val.reset_index(drop=True)
val.columns = ['text','labels']
!pip install -q simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
model_args = ClassificationArgs(num_train_epochs=1, learning_rate=1e-2)
model = ClassificationModel('distilbert', 'distilbert-base-uncased', num_labels=df.target.nunique(), args=model_args)
model.train_model(train)
scores1, model_outputs, wrong_predictions = model.eval_model(val)
scores1
from sklearn.metrics import f1_score, accuracy_score
def f1_multiclass(labels, preds):
    return f1_score(labels, preds, average='micro')
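# note: for single-label multiclass data, micro-averaged F1 equals plain accuracy,
# so this mainly serves as a sanity check alongside accuracy_score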
scores2, model_outputs, wrong_predictions = model.eval_model(val, f1=f1_multiclass, acc=accuracy_score)
scores2
predictions, raw_output  = model.predict(['<redacted>'])

TF-IDF model

!pip install -q clean-text[gpl] && cp '/content/drive/My Drive/clean_v2.py' .

from clean_v2 import clean_l1
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words('english')))

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

from nltk.stem import PorterStemmer
ps = PorterStemmer()

from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore")

tqdm.pandas()
%reload_ext autoreload
%autoreload 2
%reload_ext google.colab.data_table
%config InlineBackend.figure_format = 'retina'

plt.style.use('fivethirtyeight')
plt.style.use('seaborn-notebook')
df_raw = pd.read_pickle(os.path.join(path,'data_clean_v2.p'))
df_raw.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 824410 entries, 0 to 930912
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 824410 non-null object
1 target 824410 non-null object
dtypes: object(2)
memory usage: 18.9+ MB
df = df_raw.sample(10000, random_state=42)
def clean_l2(text):
    text = text.lower()
    text = re.sub(r'[^a-z ]', '', text)
    text = ' '.join([lemmatizer.lemmatize(w, 'v') for w in text.split()])
    text = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
    text = ' '.join([w for w in text.split() if w not in stopwords])
    text = ' '.join([ps.stem(w) for w in text.split()])
    return text
from collections import OrderedDict
df['target'] = df.target.apply(clean_l1).str.lower().str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
df['text'] = df['text'].apply(clean_l2)
df = df[df.text.apply(lambda x: len(x.split()))>3]
xx = df.sample(5, random_state=40)
xx
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
df['text'].apply(lambda x: len(x.split(' '))).sum()
minority_labels = df.target.value_counts()[df.target.value_counts()<100].index.tolist()
df['target'] = df.target.replace(dict.fromkeys(minority_labels, 'Other'))
df = df[df.target!='Other']
X = df.text
y = df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)
Pipeline(memory=None,
steps=[('vect',
CountVectorizer(analyzer='word', binary=False,
decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8',
input='content', lowercase=True, max_df=1.0,
max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None,
stop_words=None, strip_accents=None,
token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None)),
('tfidf',
TfidfTransformer(norm='l2', smooth_idf=True,
sublinear_tf=False, use_idf=True)),
('clf',
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
verbose=False)
label_list = list(df.target.unique())
%%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=label_list))
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
])
sgd.fit(X_train, y_train)
Pipeline(memory=None,
steps=[('vect',
CountVectorizer(analyzer='word', binary=False,
decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8',
input='content', lowercase=True, max_df=1.0,
max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None,
stop_words=None, strip_accents=None,
token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=Non...
('clf',
SGDClassifier(alpha=0.001, average=False, class_weight=None,
early_stopping=False, epsilon=0.1, eta0=0.0,
fit_intercept=True, l1_ratio=0.15,
learning_rate='optimal', loss='hinge',
max_iter=5, n_iter_no_change=5, n_jobs=None,
penalty='l2', power_t=0.5, random_state=42,
shuffle=True, tol=None, validation_fraction=0.1,
verbose=0, warm_start=False))],
verbose=False)
%%time
y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=label_list))
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(X_train, y_train)
Pipeline(memory=None,
steps=[('vect',
CountVectorizer(analyzer='word', binary=False,
decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8',
input='content', lowercase=True, max_df=1.0,
max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None,
stop_words=None, strip_accents=None,
token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None)),
('tfidf',
TfidfTransformer(norm='l2', smooth_idf=True,
sublinear_tf=False, use_idf=True)),
('clf',
LogisticRegression(C=100000.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=1, penalty='l2',
random_state=None, solver='lbfgs',
tol=0.0001, verbose=0, warm_start=False))],
verbose=False)
%%time
y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=label_list))
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(v.split(), [label]))
    return labeled
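# e.g. label_sentences(['refund status query'], 'Train') returns a single
# TaggedDocument with words=['refund', 'status', 'query'] and tags=['Train_0']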
X_train, X_test, y_train, y_test = train_test_split(df.text, df.target, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])
100%|██████████| 4348/4348 [00:00<00:00, 2353747.26it/s]
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha
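# the loop above makes 30 manual passes over the shuffled corpus, decaying the learning
# rate by 0.002 after each pass (min_alpha is pinned to alpha, so each pass uses a flat rate)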
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors_dbow, y_train)
LogisticRegression(C=100000.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1, l1_ratio=None,
max_iter=100, multi_class='auto', n_jobs=1, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
y_pred = logreg.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=label_list))
import itertools
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils
train_size = int(len(df) * .7)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df) - train_size))
Train size: 3043
Test size: 1305
train_posts = df['text'][:train_size]
train_tags = df['target'][:train_size]

test_posts = df['text'][train_size:]
test_tags = df['target'][train_size:]
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts) # only fit on train
x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
x_train shape: (3043, 1000)
x_test shape: (1305, 1000)
y_train shape: (3043, 21)
y_test shape: (1305, 21)
batch_size = 32
epochs = 2
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
history = model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=1,
validation_split=0.1)
Epoch 1/2
86/86 [==============================] - 1s 7ms/step - loss: 2.3918 - accuracy: 0.3561 - val_loss: 1.8259 - val_accuracy: 0.5574
Epoch 2/2
86/86 [==============================] - 0s 5ms/step - loss: 1.4318 - accuracy: 0.6402 - val_loss: 1.2779 - val_accuracy: 0.6557
score = model.evaluate(x_test, y_test,
batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])
41/41 [==============================] - 0s 2ms/step - loss: 1.3431 - accuracy: 0.6368
Test accuracy: 0.636781632900238
def checkinside(V, T):
    lAB = ((V[1][0] - V[0][0])**2 + (V[1][1] - V[0][1])**2)**(0.5)
    lBC = ((V[2][0] - V[1][0])**2 + (V[2][1] - V[1][1])**2)**(0.5)
    uAB = ((V[1][0] - V[0][0]) / lAB, (V[1][1] - V[0][1]) / lAB)
    uBC = ((V[2][0] - V[1][0]) / lBC, (V[2][1] - V[1][1]) / lBC)
    BP = ((T[0][0] - V[1][0]), (T[0][1] - V[1][1]))
    SignedDistABP = BP[0] * uAB[1] - BP[1] * uAB[0]
    SignedDistBCP = -BP[0] * uBC[1] + BP[1] * uBC[0]
    result = 'inside' if ((SignedDistABP * SignedDistBCP > 0) and
                          (abs(SignedDistABP) <= lBC) and
                          (abs(SignedDistBCP) <= lAB)) else 'not inside'
    return result

V = [(670273, 4879507), (677241, 4859302), (670388, 4856938), (663420, 4877144)]
T = [(670831, 4867989), (675097, 4869543)]
print(checkinside(V,[T[0]]))
print(checkinside(V,[T[1]]))
not inside
inside

FastText model

!pip install -q clean-text[gpl] && cp '/content/drive/My Drive/clean_v2.py' .

from clean_v2 import clean_l1
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

tqdm.pandas()
%reload_ext autoreload
%autoreload 2
%reload_ext google.colab.data_table
%config InlineBackend.figure_format = 'retina'

plt.style.use('fivethirtyeight')
plt.style.use('seaborn-notebook')
df_raw = pd.read_pickle(os.path.join(path,'data_clean_v2.p'))
df_raw.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 824410 entries, 0 to 930912
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 text 824410 non-null object
1 target 824410 non-null object
dtypes: object(2)
memory usage: 18.9+ MB
df = df_raw.sample(100000, random_state=42)
!pip install -q fasttext
# preprocessing steps
# - lowercase
# - remove placeholder tokens (emailtoken, urltoken, ...)
# - truncate text after the sign-off ('regards')
# - drop texts below a lower word-count threshold
# - truncate texts above an upper word-count threshold
# - clean categories
# - collate rare categories into 'Other'
# - train / test split

# tokenlist = ' '.join([i for i in df['text']]).split()
# tokenlist = list(set([w for w in tokenlist if 'token' in w]))
tokenlist = ['emailtoken', 'urltoken', 'htmltoken', 'currencytoken', 'token', 'digittoken', 'numbertoken']

def preprocess(text):
    text = text.lower()
    text = ' '.join([w for w in text.split() if w not in tokenlist])
    text = ' '.join(text.split()[:50])
    return text
print(tokenlist)
['emailtoken', 'urltoken', 'htmltoken', 'currencytoken', 'token', 'digittoken', 'numbertoken']
# xx = df.sample()
# xx
df['text'] = df.text.apply(preprocess)
df = df[df.text.apply(lambda x: len(x.split()))>3]
# preprocess(xx.text.tolist()[0])
from collections import OrderedDict
df['target'] = df.target.apply(clean_l1).str.lower().str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
df.sample(5)
df.target.value_counts()[:20]
# sns.distplot(df.target.value_counts())
minority_labels = df.target.value_counts()[df.target.value_counts()<100].index.tolist()
df['target'] = df.target.replace(dict.fromkeys(minority_labels, 'Other'))
df = df[df.target!='Other']
sns.distplot(df.target.value_counts());

[plot: distribution of samples per target class]

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df.target)
df.isna().any()
text      False
target False
dtype: bool
df['target'] = ['__label__'+str(s) for s in df['target']]
df = df[['target','text']]
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, random_state=42)
import csv
train.to_csv('train.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
test.to_csv('test.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
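# each line of train.txt / test.txt follows the fastText supervised format:
#   __label__<encoded class id> <preprocessed email text>
# e.g. (made-up line): __label__42 loan account statement request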
# train.sample(5).to_csv('sample.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
# import fasttext
# model = fasttext.train_supervised(input='train.txt', epoch=50) --> (17874, 0.5248405505203089, 0.5248405505203089)
# model = fasttext.train_supervised(input='train.txt', epoch=50, lr=0.5, wordNgrams=2, loss='hs') --> (17874, 0.46620789974264293, 0.46620789974264293)
# model = fasttext.train_supervised(input='train.txt', --> (17874, 0.4858453619782925, 0.4858453619782925)
# epoch=25,
# lr=0.2,
# loss='hs',
# autotuneMetric='f1',
# verbose=5,
# minCount=10,
# )
# model = fasttext.train_supervised(input='train.txt', --> (17874, 0.5262392301667226, 0.5262392301667226)
# epoch=50,
# lr=0.1,
# loss='softmax',
# autotuneMetric='f1',
# verbose=5,
# minCount=20,
# )
import fasttext
model = fasttext.train_supervised(input='train.txt',
                                  epoch=50,
                                  lr=0.1,
                                  loss='softmax',
                                  autotuneMetric='f1',
                                  verbose=5,
                                  minCount=20,
                                  )
model.test("test.txt", k=1)
(17874, 0.5262392301667226, 0.5262392301667226)
model.test("test.txt", k=5)
(17874, 0.15201740752253654, 0.6840102942821976)
xx = df.sample(); xx
model.predict(xx.text.tolist()[0], k=5)
(('__label__99', '__label__55', '__label__165', '__label__83', '__label__46'),
array([1.00007915e+00, 1.08352187e-05, 1.00006037e-05, 1.00005846e-05,
1.00005755e-05]))

Pipeline

!pip install -q fast-bert
!pip install -q fasttext
!pip install -q clean-text[gpl]
# setup
import os
import pickle
import shutil
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from collections import OrderedDict

warnings.filterwarnings("ignore")
Path(work_path).mkdir(parents=True, exist_ok=True)
os.chdir(work_path)
shutil.copyfile(os.path.join(save_path,'utils_clean.py'), os.path.join(work_path,'utils_clean.py'))
from utils_clean import clean_l1, clean_l2

shutil.copyfile(os.path.join(save_path,'utils_preprocess.py'), os.path.join(work_path,'utils_preprocess.py'))
from utils_preprocess import *

shutil.copyfile(os.path.join(save_path,'label_encoder.p'), os.path.join(work_path,'label_encoder.p'))
label_encoder = pickle.load(open('label_encoder.p', 'rb'))

shutil.copyfile(os.path.join(save_path,'label_map.p'), os.path.join(work_path,'label_map.p'))
label_map = pickle.load(open('label_map.p', 'rb'))

import fasttext
shutil.copyfile(os.path.join(save_path,'fasttext.bin'), os.path.join(work_path,'fasttext.bin'))
model_fasttext = fasttext.load_model('fasttext.bin')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
vectorizer = CountVectorizer(ngram_range=(1,1))
shutil.copyfile(os.path.join(save_path,'model_countvectorizer_large.p'), os.path.join(work_path,'model_countvectorizer.p'))
model_countvectorizer = pickle.load(open('model_countvectorizer.p', 'rb'))
dtmx = model_countvectorizer['dtm']
vectorizerx = model_countvectorizer['vectorizer']
target_categories = model_countvectorizer['target_categories']
target_labels = model_countvectorizer['target_labels']

shutil.copyfile(os.path.join(save_path,'model_tfidf_large.p'), os.path.join(work_path,'model_tfidf.p'))
model_tfidf = pickle.load(open(os.path.join(work_path,'model_tfidf.p'), 'rb'))

from fast_bert.prediction import BertClassificationPredictor
shutil.unpack_archive(os.path.join(save_path,'fastbert_large_iter2.zip'), work_path, 'zip')
MODEL_PATH = os.path.join(work_path, 'model_out')
model_fastbert = BertClassificationPredictor(
model_path=MODEL_PATH,
label_path=work_path,
multi_label=False,
model_type='distilbert',
do_lower_case=True)
X = pd.DataFrame(label_map, index=[0]).T.reset_index()
X.columns = ['Raw_data_labels','processed_labels']
X.to_csv('')
df_raw = pd.read_pickle(os.path.join('/content/drive/My Drive/','data','raw','data_raw.p'))
df_raw.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930913 entries, 0 to 930912
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 tSubject 930913 non-null object
1 mMsgContent 716024 non-null object
2 QueryType 930913 non-null object
3 SubQueryType 930913 non-null object
dtypes: object(4)
memory usage: 28.4+ MB
df_raw['labels_raw'] = df_raw['QueryType'] + ' | ' + df_raw['SubQueryType']
df_raw['labels_raw'].value_counts().to_csv('labels_rawdata.csv')
# test_set = df_raw.sample(50000, random_state=10)
# def preprocess(X):
# X = X.drop_duplicates()
# X = X.dropna(subset=['tSubject', 'mMsgContent'], how='all')
# X['type'] = X['QueryType'] + ' | ' + X['SubQueryType']
# X['type'] = X['type'].fillna(' ')
# X['type_orig'] = X['type']
# X['type'] = X['type'].apply(clean_l1).str.lower().str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ').apply(clean_l1)
# X = X[X['type'].isin(list(label_encoder.classes_))]
# X['subj&msg'] = X['tSubject'] + ' sub_eos_token ' + X['mMsgContent']
# X = X[['subj&msg','type', 'type_orig']]
# X.columns = ['text','target', 'type_orig']
# X = X.dropna()
# return X
# test_set = preprocess(test_set)
# test_set.describe()

# label_map = test_set[['type_orig','target']].set_index('type_orig').to_dict()['target']
# pickle.dump(label_map, open(os.path.join(#save_path, 'label_map.p'), 'wb'))
# test_set = df_raw.sample(10000, random_state=10)
# def preprocess(X):
# X = X.drop_duplicates()
# X = X.dropna(subset=['tSubject', 'mMsgContent'], how='all')
# X['type'] = X['QueryType'] + ' | ' + X['SubQueryType']
# X['type'] = X['type'].fillna(' ')
# X['type'] = X['type'].apply(clean_l1).str.lower().str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ').apply(clean_l1)
# X = X[X['type'].isin(list(label_encoder.classes_))]
# X['subj&msg'] = X['tSubject'] + ' sub_eos_token ' + X['mMsgContent']
# X = X[['subj&msg','type']]
# X.columns = ['text','target']
# X = X.dropna()
# return X
# test_set = preprocess(test_set)
# test_set.describe()
##### <----- frozen backup -----> #####

# def predict_fasttext(text):
# text = clean_l1(text)
# text = preprocess_fasttext(text)
# preds = model_fasttext.predict(text, k=-1)
# label_names = label_encoder.inverse_transform([int(x.split('__')[-1]) for x in preds[0]])
# preds = [(x,y) for x,y in zip(label_names,preds[1])]
# return preds

# def predict_fastbert(text):
# text = clean_l1(text)
# text = preprocess_fastbert(text)
# preds = model_fastbert.predict(text)
# preds = [(label_encoder.inverse_transform([int(x[0])])[0],x[1]) for x in preds]
# return preds

# def predict_countvect(text):
# text = clean_l1(text)
# text = clean_l2(text)
# text = preprocess_countvectorizer(text)
# cosim = linear_kernel(vectorizerx.transform([text]), dtmx).flatten()
# preds = [(target_categories[i],cosim[i]) for i in range(len(cosim))]
# preds = [target_categories[x] for x in np.argsort(-cosim)[:20]]
# return preds

# def predict_tfidf(text):
# text = clean_l1(text)
# text = clean_l2(text)
# preds = model_tfidf.predict_proba([text])[0]
# preds = [(label_encoder.inverse_transform([int(x)])[0],y) for x,y in zip(model_tfidf.classes_, preds)]
# preds.sort(key = lambda x: x[1], reverse=True)
# return preds


# query = test_set.sample()
# print('Text: ',query.text.values[0])
# print('Actual Label: ',query.target.values[0])
# print('Predicted Labels: ')
# pred1 = predict_fasttext(query.text.values[0])
# pred2 = predict_fastbert(query.text.values[0])
# pred3 = predict_countvect(query.text.values[0])
# pred4 = predict_tfidf(query.text.values[0])
# lmr = {label_map[v]:v for v in label_map.keys()}
def predict_fasttext(text):
    text = clean_l1(text)
    text = preprocess_fasttext(text)
    preds = model_fasttext.predict(text, k=-1)
    preds = [int(x.split('__')[-1]) for x in preds[0]]
    preds = pd.DataFrame([(x, (1/(i+1))) for i, x in enumerate(preds)],
                         columns=['label', 'rank_fasttext']).set_index('label')
    return preds

def predict_fastbert(text):
    text = clean_l1(text)
    text = preprocess_fastbert(text)
    preds = model_fastbert.predict(text)
    preds = pd.DataFrame([(int(x[0]), (1/(i+1))) for i, x in enumerate(preds)],
                         columns=['label', 'rank_fastbert']).set_index('label')
    return preds

def predict_tfidf(text):
    text = clean_l1(text)
    text = clean_l2(text)
    preds = model_tfidf.predict_proba([text])[0]
    preds = [(int(x), y) for x, y in zip(model_tfidf.classes_, preds)]
    preds.sort(key=lambda x: x[1], reverse=True)
    preds = pd.DataFrame([(int(x[0]), (1/(i+1))) for i, x in enumerate(preds)],
                         columns=['label', 'rank_tfidf']).set_index('label')
    return preds

def predict_countvect(text):
    text = clean_l1(text)
    text = clean_l2(text)
    text = preprocess_countvectorizer(text)
    cosim = linear_kernel(vectorizerx.transform([text]), dtmx).flatten()
    preds = [(int(target_categories[i]), cosim[i]) for i in range(len(cosim))]
    preds.sort(key=lambda x: x[1], reverse=True)
    preds = pd.DataFrame([(int(x[0]), (1/(i+1))) for i, x in enumerate(preds)],
                         columns=['label', 'rank_cvt']).set_index('label')
    return preds

model_weight = {'fasttext':10, 'fastbert':5, 'tfidf':3, 'cvt':2}

def predict(text):
    pred = predict_fasttext(text)
    pred = pred.join(predict_fastbert(text), on='label')
    pred = pred.join(predict_tfidf(text), on='label')
    pred = pred.join(predict_countvect(text), on='label')
    pred['score'] = (pred['rank_fasttext'] * model_weight['fasttext']) + \
                    (pred['rank_fastbert'] * model_weight['fastbert']) + \
                    (pred['rank_tfidf'] * model_weight['tfidf']) + \
                    (pred['rank_cvt'] * model_weight['cvt'])
    pred = pred.sort_values(by='score', ascending=False)
    return pred

# final version used below: this redefinition overrides the one above and drops the fast-bert ranks
def predict(text):
    pred = predict_fasttext(text)
    pred = pred.join(predict_tfidf(text), on='label')
    pred = pred.join(predict_countvect(text), on='label')
    pred['score'] = (pred['rank_fasttext'] * model_weight['fasttext']) + \
                    (pred['rank_tfidf'] * model_weight['tfidf']) + \
                    (pred['rank_cvt'] * model_weight['cvt'])
    pred = pred.sort_values(by='score', ascending=False)
    return pred
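Each model contributes a reciprocal-rank score (1 for its top label, 1/2 for the second, 1/3 for the third, ...), weighted by model_weight. A quick worked example with made-up ranks: a label ranked 1st by fastText, 4th by the TF-IDF model and 2nd by the count-vectorizer model scores

# 10*(1/1) + 3*(1/4) + 2*(1/2) = 10 + 0.75 + 1.0 = 11.75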
testx = pd.read_excel(os.path.join(save_path,'TSD_v1.xlsx'), sheet_name='Database records')
testx.head()
print(testx.info())
X = testx[(testx['OriginalQuery']+ ' | '+testx['OriginalSubQuery']).isin(list(set(label_map.keys())))]
print(X.info())
X.head()
X['text'] = X['tSubject'] + ' sub_eos_token ' + X['mMsgContent']
X['label'] = X['OriginalQuery'] + ' | '+ X['OriginalSubQuery']
X['plabel'] = X['label'].apply(lambda x: label_map[x])
X['clabel'] = label_encoder.transform(X.plabel)
X = X[['text','clabel']]
print(X.info())
X = X.dropna().drop_duplicates()
print(X.info())
X.head()
from tqdm import tqdm
tqdm.pandas()
top1 = top2 = top3 = 0
for index, row in tqdm(X.iterrows(), total=X.shape[0]):
    text = row.text
    label = row.clabel
    preds = predict(text).index.tolist()[:3]
    if label == preds[0]:
        top1 += 1
    elif label == preds[1]:
        top2 += 1
    elif label == preds[2]:
        top3 += 1
print(top1, top2, top3)
100%|██████████| 1800/1800 [30:07<00:00,  1.00s/it]

1131 258 93
top1p = top1/X.shape[0]
top2p = top1p + top2/X.shape[0]
top3p = top2p + top3/X.shape[0]

print(top1p, top2p, top3p)
0.6344444444444445 0.7555555555555555 0.8061111111111111
query = test_set.sample()
# print('Text: ',query.text.values[0])
print('Actual Label: ',query.target.values[0])
print('Actual Label: ',label_encoder.transform([query.target.values[0]]))
predict(query.text.values[0]).head()
Actual Label:  drf requisition status
Actual Label: [80]
test_set['target_cat'] = label_encoder.transform(test_set.target)
test_set.head()
from tqdm import tqdm
tqdm.pandas()
top1 = top2 = top3 = 0
for index, row in tqdm(test_set.iterrows(), total=test_set.shape[0]):
    text = row.text
    label = row.target_cat
    preds = predict(text).index.tolist()[:3]
    if label == preds[0]:
        top1 += 1
    elif label == preds[1]:
        top2 += 1
    elif label == preds[2]:
        top3 += 1
print(top1, top2, top3)
100%|██████████| 7183/7183 [2:00:36<00:00,  1.01s/it]

5889 664 151
top1p = top1/test_set.shape[0]
top2p = top1p + top2/test_set.shape[0]
top3p = top2p + top3/test_set.shape[0]

print(top1p, top2p, top3p)
0.8198524293470695 0.9122929138243074 0.9333147709870528
def func(subj, msg):
    text = str(subj) + ' sub_eos_token ' + str(msg)
    preds = predict(text).head(3)
    preds.index = label_encoder.inverse_transform(preds.index.tolist())
    preds = preds.rename_axis('label').reset_index()[['label', 'score']]
    preds.label = preds.label.apply(lambda x: label_map[x])
    preds = preds.T.to_dict()
    preds = str(preds)
    return preds
x = test_set.sample()
x
x.text.tolist()[0]
subj = '<redacted>'
msg = '<redacted>'
xx = func(subj, msg)
xx
xx = func(subj, msg)
xx
pd.Series(test_set['type_orig'].unique()).to_csv('label_list.csv', index=False)
!pip install -q gradio
import gradio as gr
gr.Interface(func, 
[
gr.inputs.Textbox(lines=2, label='Email Subject'),
gr.inputs.Textbox(lines=10, label='Email Body'),
],
gr.outputs.Textbox(label='Output Labels')).launch();
xx = pd.DataFrame(label_map, index=[0]).T.reset_index()
xx.columns = ['plabel', 'olabel']
xx.head()
xx.olabel.value_counts()