from collections import defaultdict
def add_one_smoothing(ngrams_counts, vocabulary_size):
    total_counts = sum(ngrams_counts.values())  # Total count of all observed N-grams
    # Unseen N-grams fall back to the add-one floor: (0 + 1) / (total_counts + vocabulary_size)
    smoothed_probs = defaultdict(lambda: 1.0 / (total_counts + vocabulary_size))
    for ngram, count in ngrams_counts.items():
        # Add-one (Laplace) smoothing formula: (count + 1) / (total + V)
        smoothed_probs[ngram] = (count + 1) / (total_counts + vocabulary_size)
    return smoothed_probs

def generate_ngrams(text, N):
    words = text.split()
    # Slide a window of length N across the token list
    ngrams_list = [tuple(words[i:i + N]) for i in range(len(words) - N + 1)]
    return ngrams_list
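
# Illustrative sanity check (not part of the original walkthrough): for N = 2
# the helper yields overlapping word pairs.
assert generate_ngrams("to be or not", 2) == [
    ("to", "be"), ("be", "or"), ("or", "not")
]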
# Sample text
text = "Natural language processing is a fascinating field."
# Define the value of N for N-grams
N = 2
# Generate N-grams
ngrams_list = generate_ngrams(text, N)
# Count occurrences of N-grams
ngrams_counts = defaultdict(int)
for ngram in ngrams_list:
    ngrams_counts[ngram] += 1
# Vocabulary size (number of unique tokens; punctuation is not stripped)
vocabulary_size = len(set(text.split()))
# Apply add-one smoothing
smoothed_probs = add_one_smoothing(ngrams_counts, vocabulary_size)
# Display smoothed probabilities for N-grams
for ngram, prob in smoothed_probs.items():
    print(f"N-gram: {ngram}, Probability: {prob}")