from collections import defaultdict


def add_one_smoothing(ngrams_counts, vocabulary_size):
    """Apply add-one (Laplace) smoothing to raw N-gram counts."""
    total_counts = sum(ngrams_counts.values())  # Total count of all observed N-grams

    # Any unseen N-gram gets the zero-count probability 1 / (total + V),
    # which is exactly what the add-one formula assigns when count = 0.
    smoothed_probs = defaultdict(lambda: 1.0 / (total_counts + vocabulary_size))

    for ngram, count in ngrams_counts.items():
        smoothed_probs[ngram] = (count + 1) / (total_counts + vocabulary_size)  # Add-one smoothing formula

    return smoothed_probs
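
# A minimal sanity check of the formula (toy counts, purely illustrative):
# with two bigrams seen 2 and 1 times, total_counts = 3 and V = 3, so a
# count of 2 smooths to (2+1)/(3+3) = 0.5 and any unseen bigram falls
# back to 1/(3+3) ≈ 0.167.
_toy = add_one_smoothing({("a", "b"): 2, ("b", "c"): 1}, vocabulary_size=3)
assert abs(_toy[("a", "b")] - 0.5) < 1e-9
assert abs(_toy[("x", "y")] - 1 / 6) < 1e-9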


def generate_ngrams(text, N):
    """Return the list of N-grams (as tuples) from a whitespace-tokenized text."""
    words = text.split()
    ngrams_list = [tuple(words[i:i + N]) for i in range(len(words) - N + 1)]  # Sliding window of size N
    return ngrams_list
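
# Quick illustration of the sliding window: a 4-word sentence yields 3 bigrams.
assert generate_ngrams("the cat sat down", 2) == [
    ("the", "cat"), ("cat", "sat"), ("sat", "down"),
]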


# Sample text
text = "Natural language processing is a fascinating field."

# Define the value of N for N-grams
N = 2

# Generate N-grams
ngrams_list = generate_ngrams(text, N)

# Count occurrences of each N-gram
ngrams_counts = defaultdict(int)
for ngram in ngrams_list:
    ngrams_counts[ngram] += 1
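
# Equivalent idiom: collections.Counter builds the same count table in one call.
from collections import Counter
assert dict(Counter(ngrams_list)) == dict(ngrams_counts)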


# Vocabulary size (number of unique words)
vocabulary_size = len(set(text.split()))
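
# Note: str.split() keeps punctuation attached, so "field." counts as a
# distinct token here; a real pipeline would normalize/tokenize first.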


# Apply add-one smoothing
smoothed_probs = add_one_smoothing(ngrams_counts, vocabulary_size)


# Display smoothed probabilities for the observed N-grams
for ngram, prob in smoothed_probs.items():
    print(f"N-gram: {ngram}, Probability: {prob}")

