# Source code for sentence_handler
from typing import List
import networkx
import numpy
import pandas
from networkx import PowerIterationFailedConvergence
from sklearn.preprocessing import LabelEncoder
def sentence_pairing(sentences: List[str]) -> pandas.DataFrame:
    """
    Create a matrix of paired sentences, where same sentences are omitted.

    :param sentences: list of sentences
    :return: DataFrame with the columns ["sent_1", "sent_2"] where each row is a paired sentences.
    """
    # Pair each sentence with every later-or-equal position; identical texts
    # (including a sentence paired with itself) produce no row.
    pairs = [
        [sentences[left], sentences[right]]
        for left in range(len(sentences))
        for right in range(left, len(sentences))
        if sentences[left] != sentences[right]
    ]
    return pandas.DataFrame(pairs, columns=["sent_1", "sent_2"])
def sentence_rank_with_page_rank(sentence_pairs_with_score: pandas.DataFrame) -> pandas.DataFrame:
    """
    Rank the sentences based on their similarity score to each other using page-rank algorithm and output their
    new scores.

    :param sentence_pairs_with_score: DataFrame with the columns ["sent_1", "sent_2", "score"] where each row is a paired sentences with their initial similarity score (one direction per pair is sufficient).
    :return: DataFrame with the columns ["sentence", "rank"] where each sentence has its rank.
    """
    sentences = set()
    sentences.update(sentence_pairs_with_score["sent_1"].tolist())
    sentences.update(sentence_pairs_with_score["sent_2"].tolist())
    sentences_list = list(sentences)

    # LabelEncoder maps each sentence text to a stable integer index
    # (classes are sorted, so indices are deterministic for a given set).
    le = LabelEncoder()
    le.fit(sentences_list)

    # Build a symmetric {(sent_a, sent_b): score} lookup once, instead of
    # boolean-filtering the DataFrame inside the O(n^2) loop below.
    scores_by_pair = {}
    for s1, s2, score in zip(sentence_pairs_with_score["sent_1"],
                             sentence_pairs_with_score["sent_2"],
                             sentence_pairs_with_score["score"]):
        scores_by_pair[(s1, s2)] = score
        scores_by_pair[(s2, s1)] = score

    # Transform all sentences in one call rather than one call per loop step.
    indices = le.transform(sentences_list)
    similarity_matrix = numpy.zeros((len(sentences_list), len(sentences_list)))
    for idx1, first_sent in enumerate(sentences_list):
        for idx2, second_sent in enumerate(sentences_list):
            if idx1 == idx2:
                # ignore if both are same sentences
                continue
            # A pair missing from the input contributes no edge (0.0) instead
            # of raising IndexError as the DataFrame lookup used to.
            similarity_matrix[indices[idx1]][indices[idx2]] = scores_by_pair.get(
                (first_sent, second_sent), 0.0)

    sentence_similarity_graph = networkx.from_numpy_array(similarity_matrix)
    try:
        scores = networkx.pagerank(sentence_similarity_graph, max_iter=10_000)
    except PowerIterationFailedConvergence:
        # Fall back to a loose tolerance so a result is always produced.
        scores = networkx.pagerank(sentence_similarity_graph, tol=1)

    result = pandas.DataFrame()
    result["sentence"] = scores.keys()
    # Graph node ids are LabelEncoder indices; map them back to sentence text.
    result["sentence"] = le.inverse_transform(result["sentence"])
    result["rank"] = scores.values()
    return result
def sentence_sorter(df: pandas.DataFrame, top_n: int, sentences: List[str]) -> str:
    """
    Sort the sentences based on their rank and return the full summarized text in which the sentences appear as they are
    in the given text.

    :param df: DataFrame with the columns ["sentence", "rank"] where each sentence has its rank.
    :param top_n: total number of sentences in the summarized text.
    :param sentences: the given text tokenized into sentences.
    :return: the summarized text
    """
    sorted_df = df.sort_values(by=["rank"], ascending=False)
    # A set makes the membership test in the comprehension O(1) instead of
    # scanning a list for every sentence of the text.
    selected_sentences = set(sorted_df.head(top_n)["sentence"])
    # Keep the original text order, not the rank order.
    result = [sentence for sentence in sentences if sentence in selected_sentences]
    return " ".join(result)