Module implicit_word_network.parse_documents
# ---------------------------------------------------------------------------- #
# Import and Parse Corpus                                                      #
# ---------------------------------------------------------------------------- #
from tqdm import tqdm


def parseDocument(d_id=0, d="", entity_types=[], nlp=None):
    """
    Parse a document. The document is sentencized and tokenized,
    and entities are marked.
    """
    d_parsed = []
    # SpaCy token object: https://spacy.io/api/token
    nlp_d = nlp(d)
    S = nlp_d.sents
    # Iterate over sentences
    for s_id, s in enumerate(S):
        # Iterate over tokens
        for t_id, t in enumerate(s):
            # Check if punctuation
            is_punct = t.pos_ == "PUNCT"
            # A token counts as an entity if it lies inside an entity span and
            # its type matches the filter (an empty filter accepts all types)
            is_entity = t.ent_iob_ != "O" and (
                (t.ent_type_ in entity_types) or (len(entity_types) == 0)
            )
            t_parsed = {
                "d_id": d_id,
                "s_id": s_id,
                "t_id": t_id,
                "text": t.text,
                "pos": t.pos_,
                "is_stopword": t.is_stop,
                "is_punctuation": is_punct,
                "is_entity": is_entity,
                "entity_type": t.ent_type_,
            }
            d_parsed.append(t_parsed)
    return d_parsed


def parseDocuments(D=[], entity_types=[], show_progress=True, nlp=None):
    """
    Parse a list of documents. Documents are sentencized and tokenized,
    and entities are marked.
    """
    D_parsed = []
    # Iterate over documents
    for d_id, d in enumerate(tqdm(D, desc="Documents", disable=(not show_progress))):
        d_parsed = parseDocument(d_id, d, entity_types, nlp)
        D_parsed.extend(d_parsed)
    return D_parsed


def createCorpMat(D=[], remove_stopwords=True, show_progress=True):
    """
    Convert parsing results from a flat list of tokens into a nested
    dictionary indexed as D_mat[d_id][s_id][t_id].
    """
    D_mat = {}
    # Iterate over all tokens in the flat list
    for t in tqdm(D, desc="Tokens", disable=(not show_progress)):
        d_id = t["d_id"]
        s_id = t["s_id"]
        t_id = t["t_id"]
        # Init empty document dict if not present already
        if d_id not in D_mat:
            D_mat[d_id] = {}
        # Init empty sentence dict if not present already
        if s_id not in D_mat[d_id]:
            D_mat[d_id][s_id] = {}
        # Store the token unless it is a stopword or punctuation
        # and filtering is enabled
        if (not t["is_stopword"] and not t["is_punctuation"]) or not remove_stopwords:
            D_mat[d_id][s_id][t_id] = t
    return D_mat
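A minimal end-to-end usage sketch (assuming spaCy is installed and the en_core_web_sm model is available; the sample documents are hypothetical):

import spacy

from implicit_word_network.parse_documents import parseDocuments, createCorpMat

# Load a spaCy pipeline (assumed model; any pipeline with NER works)
nlp = spacy.load("en_core_web_sm")

D = [
    "Albert Einstein was born in Ulm. He developed the theory of relativity.",
    "Marie Curie won two Nobel Prizes.",
]

# Flat list of token dicts across all documents
D_parsed = parseDocuments(D, entity_types=["PERSON"], show_progress=False, nlp=nlp)

# Nested dict indexed as D_mat[d_id][s_id][t_id]; stopwords/punctuation dropped
D_mat = createCorpMat(D_parsed, remove_stopwords=True, show_progress=False)

print(D_mat[0][0])  # tokens of the first sentence of the first document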
Functions
def createCorpMat(D=[], remove_stopwords=True, show_progress=True)
-
Convert parsing results from a flat list of tokens into a nested dictionary indexed as D_mat[d_id][s_id][t_id].
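A short sketch of traversing the returned structure (assumes D_parsed is a flat token list as produced by parseDocuments):

D_mat = createCorpMat(D_parsed, remove_stopwords=True, show_progress=False)
# Keys are document ids, then sentence ids, then token ids
for d_id, sentences in D_mat.items():
    for s_id, tokens in sentences.items():
        print(d_id, s_id, [t["text"] for t in tokens.values()])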
def parseDocument(d_id=0, d='', entity_types=[], nlp=None)
-
Parse a document. The document is sentencized and tokenized, and entities are marked.
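A minimal call sketch for a single document (assuming a loaded spaCy pipeline; the exact POS and entity labels depend on the model):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model
tokens = parseDocument(d_id=0, d="Berlin is the capital of Germany.", nlp=nlp)
# Each entry is a dict of the form (values depend on the model):
# {"d_id": 0, "s_id": 0, "t_id": 0, "text": "Berlin", "pos": "PROPN",
#  "is_stopword": False, "is_punctuation": False, "is_entity": True,
#  "entity_type": "GPE"}
print(tokens[0]["text"], tokens[0]["entity_type"])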
def parseDocuments(D=[], entity_types=[], show_progress=True, nlp=None)
-
Parse a list of documents. Documents are sentencized and tokenized, and entities are marked.
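A sketch of restricting entity marking to selected types (assumes a loaded spaCy pipeline nlp; entity labels depend on the model, and an empty entity_types list accepts all types):

D = ["Angela Merkel visited Paris in 2015."]
# Only tokens inside PERSON or GPE entity spans get is_entity=True;
# with this filter a DATE token such as "2015" is not marked
D_parsed = parseDocuments(D, entity_types=["PERSON", "GPE"], show_progress=False, nlp=nlp)
entities = [t["text"] for t in D_parsed if t["is_entity"]]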