#!/usr/bin/env python3
"""
TEST 2: RARE BIGRAM ANALYSIS
Only counts bigrams that appear in ≤5 of the plays analysed (rare phrases);
the plays analysed are those dated 1590-1615 with at least 1000 alphabetic
tokens. This filters out common phrases like "the king" and focuses on
distinctive vocabulary.

Also compares Henry VIII Shakespeare vs Fletcher divisions.

Output: Rankings and shared rare bigram details.
"""

import sqlite3
import re
from collections import Counter, defaultdict
import csv

# Paths - adjust if needed
# DB_PATH: SQLite database opened by main(); queried for the `plays` and
#   `words` tables.
# CONFESSION_XML: XML file scanned by extract_xml_lemmas() for lemma="..."
#   attribute values.
# OUTPUT_FILE: CSV file that main() writes the per-play ranking to
#   (overwritten on every run).
DB_PATH = 'early_modern_plays.db'
CONFESSION_XML = 'Neville_Confession_1600.xml'
OUTPUT_FILE = 'results_rare_bigrams.csv'

def has_alpha(token):
    """Return True if *token* contains at least one ASCII letter (A-Z, a-z).

    Used to discard tokens made up purely of digits or punctuation.
    Non-ASCII letters (e.g. accented characters) do not count.
    """
    found = re.search(r'[A-Za-z]', token)
    # A match object is always truthy; None means no letter was found.
    return bool(found)

def extract_xml_lemmas(path):
    """Return the ``lemma`` attribute values found in a file, in file order.

    The file is scanned with a regular expression rather than an XML parser:
    every ``lemma="..."`` or ``lemma='...'`` occurrence (attribute name
    matched case-insensitively) contributes one value. XML entities inside
    the values are not decoded.

    Args:
        path: Path of the file to read. It is decoded as UTF-8; bytes that
            are not valid UTF-8 are silently dropped.

    Returns:
        A list of lower-cased, whitespace-stripped lemma strings. Values that
        contain no ASCII letter (see ``has_alpha``) are omitted.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        xml = f.read()

    # The closing quote must be the same character as the opening quote.
    # Allowing either quote to terminate the value (as a single character
    # class would) truncates lemmas that contain an apostrophe, e.g.
    # lemma="o'er" would be read as just "o".
    pattern = r'lemma=(?:"([^"]+)"|\'([^\']+)\')'

    lemmas = []
    for double_quoted, single_quoted in re.findall(pattern, xml, flags=re.IGNORECASE):
        # Exactly one of the two groups is non-empty for any match.
        value = double_quoted or single_quoted
        if has_alpha(value):
            lemmas.append(value.lower().strip())
    return lemmas

def build_bigrams(tokens):
    """Count every adjacent token pair in *tokens*.

    Args:
        tokens: A list of tokens, in text order.

    Returns:
        A ``Counter`` mapping ``(token, next_token)`` tuples to the number of
        times that pair occurs. Empty for inputs with fewer than two tokens.
    """
    # Pairing the list with itself shifted by one yields each adjacent pair
    # exactly once; zip stops at the shorter (shifted) sequence.
    adjacent_pairs = zip(tokens, tokens[1:])
    return Counter(adjacent_pairs)

# Analysis parameters used by main().
RARE_MAX_PLAYS = 5               # a bigram is "rare" if it occurs in at most this many plays
MIN_PLAY_TOKENS = 1000           # plays with fewer alphabetic tokens are left out of the corpus
FIRST_YEAR, LAST_YEAR = 1590, 1615   # inclusive CREATION_YEAR window for the corpus
SHAKESPEARE_DIVISION_ID = 54     # DIVISION_ID of the Henry VIII Shakespeare division
FLETCHER_DIVISION_ID = 109       # DIVISION_ID of the Henry VIII Fletcher division
HAMLET_PLAY_ID = 503             # PLAY_ID used for the Hamlet comparison


def _print_heading(title):
    """Print a section heading framed by two 70-character rules."""
    print("\n" + "=" * 70)
    print(title)
    print("=" * 70)


def _division_tokens(cursor, division_id):
    """Return the lower-cased alphabetic A0 tokens of one division, in WORD_ID order."""
    cursor.execute(
        'SELECT A0 FROM words WHERE DIVISION_ID = ? AND A0 IS NOT NULL ORDER BY WORD_ID',
        (division_id,),
    )
    return [word.lower() for (word,) in cursor.fetchall() if has_alpha(word)]


def _shared_with_confession(tokens, confession_rare):
    """Return the bigrams of *tokens* that are also rare Confession bigrams."""
    # confession_rare only holds rare bigrams, so intersecting with it also
    # restricts the text's bigrams to the rare ones.
    return confession_rare.intersection(build_bigrams(tokens))


def _print_shared(shared, bigram_play_count):
    """List shared bigrams, rarest first; ties are broken alphabetically."""
    # The bigram itself is the secondary key so the listing does not depend
    # on set iteration order.
    for bg in sorted(shared, key=lambda b: (bigram_play_count[b], b)):
        print(f"  '{' '.join(bg)}' - appears in {bigram_play_count[bg]} plays")


def main():
    """Run the rare-bigram comparison, print the report and save the ranking.

    Steps:
      1. Build bigram counts for the Confession from its XML lemma values.
      2. Build bigram counts for every play whose CREATION_YEAR lies in
         FIRST_YEAR..LAST_YEAR and that has at least MIN_PLAY_TOKENS
         alphabetic tokens; record in how many of those plays each bigram
         occurs.
      3. Treat bigrams occurring in at most RARE_MAX_PLAYS plays as rare and
         rank the plays by how many rare bigrams they share with the
         Confession. The top 40 are printed and the full ranking is written
         to OUTPUT_FILE as CSV.
      4. Repeat the shared-rare-bigram count for the two Henry VIII
         divisions and for Hamlet, listing the shared bigrams.

    The database connection is closed even if a step fails.

    Raises:
        OSError: If the Confession XML cannot be read or the CSV cannot be
            written.
        sqlite3.Error: If the database cannot be queried.
    """
    print("=" * 70)
    print("TEST 2: RARE BIGRAM ANALYSIS")
    print(f"(Only bigrams appearing in ≤{RARE_MAX_PLAYS} plays)")
    print("=" * 70)

    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        # Extract Confession
        print("\n1. Extracting Confession bigrams...")
        confession_lemmas = extract_xml_lemmas(CONFESSION_XML)
        confession_bigrams = build_bigrams(confession_lemmas)
        print(f"   Tokens: {len(confession_lemmas)}")
        print(f"   Unique bigrams: {len(confession_bigrams)}")

        # Get all plays in the year window
        cursor.execute(
            """
            SELECT PLAY_ID, TITLE, CREATION_YEAR
            FROM plays
            WHERE CREATION_YEAR BETWEEN ? AND ?
            ORDER BY CREATION_YEAR
            """,
            (FIRST_YEAR, LAST_YEAR),
        )
        plays = cursor.fetchall()
        print(f"\n2. Loading {len(plays)} plays...")

        # Load the tokens of every play in the words table (not only the
        # plays selected above; the Hamlet comparison reads from this too).
        play_tokens = defaultdict(list)
        cursor.execute(
            "SELECT PLAY_ID, A0 FROM words WHERE A0 IS NOT NULL ORDER BY PLAY_ID, WORD_ID"
        )
        for pid, word in cursor:
            if word and has_alpha(word):
                play_tokens[pid].append(word.lower())

        # Build bigrams for each sufficiently long play
        play_bigrams = {}
        bigram_play_count = Counter()  # how many plays contain each bigram
        for pid, _title, _year in plays:
            tokens = play_tokens.get(pid, [])
            if len(tokens) < MIN_PLAY_TOKENS:
                continue
            bigrams = build_bigrams(tokens)
            play_bigrams[pid] = bigrams
            # Counter.update with an iterable adds 1 per distinct bigram.
            bigram_play_count.update(bigrams.keys())

        print(f"   Total unique bigrams in corpus: {len(bigram_play_count)}")

        # Find rare bigrams
        rare_bigrams = {
            bg for bg, count in bigram_play_count.items() if count <= RARE_MAX_PLAYS
        }
        print(f"   Rare bigrams (≤{RARE_MAX_PLAYS} plays): {len(rare_bigrams)}")

        # Restrict the Confession bigrams to the rare ones
        confession_rare = {bg for bg in confession_bigrams if bg in rare_bigrams}
        print(f"   Confession rare bigrams: {len(confession_rare)}")

        # Count rare bigram matches with each play
        print("\n3. Counting rare bigram matches with each play...")

        results = []
        for pid, title, year in plays:
            if pid not in play_bigrams:
                continue
            play_rare = rare_bigrams.intersection(play_bigrams[pid])
            shared = confession_rare & play_rare
            results.append({
                'PLAY_ID': pid,
                'Title': title,
                'Year': year,
                'Shared_Rare_Bigrams': len(shared),
                'Play_Rare_Bigrams': len(play_rare),
            })

        # Most shared bigrams first; the sort is stable, so ties keep the
        # year order of the query above.
        results.sort(key=lambda r: r['Shared_Rare_Bigrams'], reverse=True)
        for rank, r in enumerate(results, 1):
            r['Rank'] = rank

        # Print top 40
        _print_heading("TOP 40 PLAYS: SHARED RARE BIGRAMS WITH CONFESSION")
        print(f"{'Rank':<5} {'Year':<6} {'Shared':<8} {'Title'}")
        print("-" * 70)
        for r in results[:40]:
            # A NULL title in the database would otherwise break the slice.
            short_title = (r['Title'] or '')[:50]
            print(f"{r['Rank']:<5} {r['Year']:<6} {r['Shared_Rare_Bigrams']:<8} {short_title}")

        # Save results. An explicit encoding keeps non-ASCII titles from
        # failing under a non-UTF-8 locale default.
        with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(
                f,
                fieldnames=['Rank', 'PLAY_ID', 'Title', 'Year',
                            'Shared_Rare_Bigrams', 'Play_Rare_Bigrams'],
            )
            writer.writeheader()
            writer.writerows(results)
        print(f"\nResults saved to {OUTPUT_FILE}")

        # ========================================
        # HENRY VIII DIVISION ANALYSIS
        # ========================================
        _print_heading("HENRY VIII: SHAKESPEARE vs FLETCHER DIVISIONS")

        shakes_words = _division_tokens(cursor, SHAKESPEARE_DIVISION_ID)
        fletcher_words = _division_tokens(cursor, FLETCHER_DIVISION_ID)
        print(f"\nShakespeare division: {len(shakes_words)} words")
        print(f"Fletcher division: {len(fletcher_words)} words")

        shared_shakes = _shared_with_confession(shakes_words, confession_rare)
        shared_fletcher = _shared_with_confession(fletcher_words, confession_rare)
        print(f"\nShakespeare division: {len(shared_shakes)} shared rare bigrams")
        print(f"Fletcher division:    {len(shared_fletcher)} shared rare bigrams")

        print("\nSHAKESPEARE DIVISION - Shared rare bigrams:")
        print("-" * 50)
        _print_shared(shared_shakes, bigram_play_count)

        print("\nFLETCHER DIVISION - Shared rare bigrams:")
        print("-" * 50)
        _print_shared(shared_fletcher, bigram_play_count)

        # Hamlet comparison
        _print_heading("HAMLET COMPARISON")
        hamlet_words = play_tokens.get(HAMLET_PLAY_ID, [])
        shared_hamlet = _shared_with_confession(hamlet_words, confession_rare)
        print(f"\nHamlet: {len(shared_hamlet)} shared rare bigrams")
        _print_shared(shared_hamlet, bigram_play_count)
    finally:
        # Release the database handle even when a step above raised.
        conn.close()

    print("\n✓ Analysis complete!")


if __name__ == '__main__':
    main()