# Source code for src.preprocessing.keyword_extraction

from collections import defaultdict
from typing import List

import nltk
from keybert import KeyBERT
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from src.utils.dbconnector import append_to_document, find_documents
from src.utils.logger import setup_logger

# Fetch the NLTK resources used in this module at import time: "punkt" for
# word_tokenize and "stopwords" for the English stop-word list.
# NOTE(review): recent NLTK releases look up "punkt_tab" instead of "punkt" —
# confirm against the NLTK version this project pins.
nltk.download("punkt")
nltk.download("stopwords")

# Module-level logger shared by the functions below.
logger = setup_logger()


# Not in use
def preprocess_text(text):
    """
    Tokenize a text and drop English stopwords and non-alphanumeric tokens.

    Args:
        text (str): The text to preprocess.

    Returns:
        List[str]: The remaining tokens, in their original order and casing.
        An empty list is returned when tokenization raises.
    """
    logger.info("Preprocessing text for tokenization and stopword removal.")
    english_stopwords = set(stopwords.words("english"))

    try:
        tokens = word_tokenize(text)
    except Exception as exc:
        # Best effort: report the failure and hand back an empty result.
        logger.error("Error during tokenization: %s", exc)
        return []

    kept_tokens = []
    for token in tokens:
        # Skip punctuation and any token containing non-alphanumeric chars.
        if not token.isalnum():
            continue
        # Stopword matching is case-insensitive; the token keeps its casing.
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)

    logger.info("Text preprocessed successfully.")
    return kept_tokens
# Not in use
def bert_keyword_extraction(texts: List[str], top_n: int = 10) -> List[str]:
    """
    Extract a de-duplicated list of keywords from several texts using KeyBERT.

    Args:
        texts (List[str]): List of texts to extract keywords from.
        top_n (int): Number of top keywords to extract per text.

    Returns:
        List[str]: Unique keywords/keyphrases (1- and 2-grams) across all
        texts, ordered by first appearance so the result is reproducible
        between runs.
    """
    logger.info("Starting keyword extraction using KeyBERT.")
    model = KeyBERT("all-MiniLM-L6-v2")

    all_keywords = []
    for text in texts:
        keywords = model.extract_keywords(
            text, keyphrase_ngram_range=(1, 2), top_n=top_n
        )
        # Each result item is indexed as (keyphrase, ...); keep the phrase only.
        all_keywords.extend(kw[0] for kw in keywords)

    logger.info("KeyBERT keyword extraction completed successfully.")
    # dict.fromkeys removes duplicates while keeping first-seen order; a plain
    # set() would return the keywords in an order that varies between runs.
    return list(dict.fromkeys(all_keywords))
def extract_keywords(article_ids, top_n: int = 10) -> list:
    """
    Extract KeyBERT keywords for stored articles and persist them.

    Looks up the ``News_Articles`` documents whose ``id`` is in
    ``article_ids``, runs KeyBERT over each document's ``summary`` and hands
    the resulting keywords to ``append_to_document`` for that article.

    Args:
        article_ids: IDs of the articles to process; used as the operand of
            the ``$in`` filter on the ``id`` field.
        top_n (int): Number of top keywords to extract per article.

    Returns:
        list: Exactly one entry per retrieved article, in retrieval order.
        On success the entry is ``{"id": <article id>, "keywords": [...]}``;
        when keyword extraction fails for an article the entry is an empty
        list ``[]`` (kept for backward compatibility with existing callers).
    """
    documents = find_documents("News_Articles", {"id": {"$in": article_ids}})
    article_summaries = [
        {"id": doc["id"], "summary": doc["summary"]} for doc in documents
    ]

    logger.info("Initializing KeyBERT model for keyword extraction.")
    model = KeyBERT("all-MiniLM-L6-v2")

    total = len(article_summaries)
    article_keywords = []
    logger.info("Extracting keywords from %d texts.", total)

    for idx, obj in enumerate(article_summaries, start=1):
        logger.debug("Extracting keywords from text %d/%d.", idx, total)

        try:
            keywords = model.extract_keywords(
                obj["summary"],
                keyphrase_ngram_range=(1, 2),
                stop_words="english",
                top_n=top_n,
            )
            # Each result item is indexed as (keyphrase, ...); keep the phrase.
            extracted_keywords = [kw[0] for kw in keywords]
        except Exception as e:
            logger.error("Error extracting keywords from text %d: %s", idx, e)
            article_keywords.append([])
            continue

        keyword_obj = {"id": obj["id"], "keywords": extracted_keywords}
        article_keywords.append(keyword_obj)
        logger.debug("Keywords for text %d: %s", idx, extracted_keywords)

        # Persist separately from extraction: a storage failure must not add
        # a second (placeholder) entry for an article whose keywords were
        # already recorded above, nor be reported as an extraction error.
        try:
            append_to_document("News_Articles", {"id": obj["id"]}, keyword_obj)
        except Exception as e:
            logger.error("Error storing keywords for text %d: %s", idx, e)

    logger.info("Keyword extraction completed.")
    return article_keywords
# def aggregate_keywords(texts, top_n=10):
#     logger.info("Aggregating keywords across all articles.")
#     keywords = extract_keywords(texts, top_n)
#     logger.info(f"Top {top_n} aggregated keywords: {keywords}")
#     return keywords