# Source code for sentence_handler
from typing import List
import networkx
import numpy
import pandas
from networkx import PowerIterationFailedConvergence
from sklearn.preprocessing import LabelEncoder
def sentence_pairing(sentences: List[str]) -> pandas.DataFrame:
    """
    Create a matrix of paired sentences, where same sentences are omitted.

    :param sentences: list of sentences
    :return: DataFrame with the columns ["sent_1", "sent_2"] where each row is a paired sentences.
    """
    # Pair each sentence with every later-or-equal position; identical texts
    # (including a sentence paired with itself) produce no row.
    pairs = [
        [sentences[left], sentences[right]]
        for left in range(len(sentences))
        for right in range(left, len(sentences))
        if sentences[left] != sentences[right]
    ]
    return pandas.DataFrame(pairs, columns=["sent_1", "sent_2"])
def sentence_rank_with_page_rank(sentence_pairs_with_score: pandas.DataFrame) -> pandas.DataFrame:
    """
    Rank the sentences based on their similarity score to each other using page-rank algorithm and output their
    new scores.

    :param sentence_pairs_with_score: DataFrame with the columns ["sent_1", "sent_2", "score"] where each row is a paired sentences with their initial similarity score (one direction per pair is sufficient).
    :return: DataFrame with the columns ["sentence", "rank"] where each sentence has its rank.
    """
    sentences = set()
    sentences.update(sentence_pairs_with_score["sent_1"].tolist())
    sentences.update(sentence_pairs_with_score["sent_2"].tolist())
    sentences_list = list(sentences)

    # LabelEncoder maps each sentence text to a stable integer index
    # (classes are sorted, so indices are deterministic for a given set).
    le = LabelEncoder()
    le.fit(sentences_list)

    # Build a symmetric {(sent_a, sent_b): score} lookup once, instead of
    # boolean-filtering the DataFrame inside the O(n^2) loop below.
    scores_by_pair = {}
    for s1, s2, score in zip(sentence_pairs_with_score["sent_1"],
                             sentence_pairs_with_score["sent_2"],
                             sentence_pairs_with_score["score"]):
        scores_by_pair[(s1, s2)] = score
        scores_by_pair[(s2, s1)] = score

    # Transform all sentences in one call rather than one call per loop step.
    indices = le.transform(sentences_list)
    similarity_matrix = numpy.zeros((len(sentences_list), len(sentences_list)))
    for idx1, first_sent in enumerate(sentences_list):
        for idx2, second_sent in enumerate(sentences_list):
            if idx1 == idx2:
                # ignore if both are same sentences
                continue
            # A pair missing from the input contributes no edge (0.0) instead
            # of raising IndexError as the DataFrame lookup used to.
            similarity_matrix[indices[idx1]][indices[idx2]] = scores_by_pair.get(
                (first_sent, second_sent), 0.0)

    sentence_similarity_graph = networkx.from_numpy_array(similarity_matrix)
    try:
        scores = networkx.pagerank(sentence_similarity_graph, max_iter=10_000)
    except PowerIterationFailedConvergence:
        # Fall back to a loose tolerance so a result is always produced.
        scores = networkx.pagerank(sentence_similarity_graph, tol=1)

    result = pandas.DataFrame()
    result["sentence"] = scores.keys()
    # Graph node ids are LabelEncoder indices; map them back to sentence text.
    result["sentence"] = le.inverse_transform(result["sentence"])
    result["rank"] = scores.values()
    return result
def sentence_sorter(df: pandas.DataFrame, top_n: int, sentences: List[str]) -> str:
    """
    Sort the sentences based on their rank and return the full summarized text in which the sentences appear as they are
    in the given text.

    :param df: DataFrame with the columns ["sentence", "rank"] where each sentence has its rank.
    :param top_n: total number of sentences in the summarized text.
    :param sentences: the given text tokenized into sentences.
    :return: the summarized text
    """
    sorted_df = df.sort_values(by=["rank"], ascending=False)
    # A set makes the membership test in the comprehension O(1) instead of
    # scanning a list for every sentence of the text.
    selected_sentences = set(sorted_df.head(top_n)["sentence"])
    # Keep the original text order, not the rank order.
    result = [sentence for sentence in sentences if sentence in selected_sentences]
    return " ".join(result)