# Install required packages
pip install psycopg2-binary pandas numpy matplotlib seaborn wordcloud python-dotenv

# PostgreSQL with Apache AGE extension must be running
# Default port: 5433

✅ Libraries imported successfully
📦 pandas version: 3.0.0
📦 numpy version: 2.4.2

# Install required packages
pip install psycopg2-binary pandas numpy matplotlib seaborn wordcloud python-dotenv

# PostgreSQL with Apache AGE extension must be running
# Default port: 5433

# Library imports
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import json
import random
from wordcloud import WordCloud
import warnings
from typing import Optional, Dict, Any

# Configure visualization defaults
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("✅ Libraries imported successfully")
print(f"📦 pandas version: {pd.__version__}")
print(f"📦 numpy version: {np.__version__}")

✅ Libraries imported successfully
📦 pandas version: 3.0.0
📦 numpy version: 2.4.2

import os
from dotenv import load_dotenv

load_dotenv()
DB_CONFIG = {
    'host': os.getenv('DB_HOST', 'localhost'),
    'port': int(os.getenv('DB_PORT', 5433)),
    'database': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD')
}

# Collect entity counts
stats = {}

queries = [
    ('provisions', "MATCH (p:Provision) RETURN COUNT(p)"),
    ('requirements', "MATCH (r:Requirement) RETURN COUNT(r)"),
    ('constraints', "MATCH (c:Constraint) RETURN COUNT(c)"),
    ('relationships', "MATCH ()-[r]->() RETURN COUNT(r)")
]

print("🔄 Querying graph statistics...\n")

for name, cypher_query in queries:
    full_query = f"SELECT * FROM cypher('{GRAPH_NAME}', $$ {cypher_query} $$) as (count agtype);"
    result = execute_cypher(full_query)
    stats[name] = result.iloc[0, 0]
    print(f"  ✓ Counted {stats[name]:,} {name}")

# Create summary table
summary_df = pd.DataFrame([
    {'Entity': 'Provisions', 'Count': stats['provisions'], 'Description': 'Legal text fragments'},
    {'Entity': 'Requirements', 'Count': stats['requirements'], 'Description': 'Parsed obligations'},
    {'Entity': 'Constraints', 'Count': stats['constraints'], 'Description': 'Computable conditions'},
    {'Entity': 'Relationships', 'Count': stats['relationships'], 'Description': 'Graph edges'}
])

print("\n📊 GRAPH SUMMARY")
print("="*70)
print(summary_df.to_string(index=False))

# Calculate key metrics
total_nodes = stats['provisions'] + stats['requirements'] + stats['constraints']
reqs_per_prov = stats['requirements'] / stats['provisions']
consts_per_req = stats['constraints'] / stats['requirements']
rels_per_node = stats['relationships'] / total_nodes

print(f"\n📈 DERIVED METRICS")
print("="*70)
print(f"Total Nodes:                 {total_nodes:,}")
print(f"Requirements per Provision:  {reqs_per_prov:.2f}")
print(f"Constraints per Requirement: {consts_per_req:.2f}")
print(f"Relationships per Node:      {rels_per_node:.2f}")

# Graph density (actual edges / possible edges)
max_edges = total_nodes * (total_nodes - 1)  # Directed graph
density = stats['relationships'] / max_edges
print(f"Graph Density:               {density:.6f} ({density*100:.4f}%)")

🔄 Querying graph statistics...

  ✓ Counted 51,397 provisions
  ✓ Counted 131,069 requirements
  ✓ Counted 132,202 constraints
  ✓ Counted 480,654 relationships

📊 GRAPH SUMMARY
======================================================================
       Entity  Count           Description
   Provisions  51397  Legal text fragments
 Requirements 131069    Parsed obligations
  Constraints 132202 Computable conditions
Relationships 480654           Graph edges

📈 DERIVED METRICS
======================================================================
Total Nodes:                 314,668
Requirements per Provision:  2.55
Constraints per Requirement: 1.01
Relationships per Node:      1.53
Graph Density:               0.000005 (0.0005%)

fig, ax = plt.subplots(1, 1, figsize=(10, 6))

entities = ['Provisions', 'Requirements', 'Constraints']
counts = [stats['provisions'], stats['requirements'], stats['constraints']]
colors = ['#3498db', '#e74c3c', '#2ecc71']

bars = ax.bar(entities, counts, color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
ax.set_ylabel('Count', fontsize=12, fontweight='bold')
ax.set_title('Regulatory Graph - Entity Distribution', fontsize=14, fontweight='bold', pad=15)
ax.grid(axis='y', alpha=0.3)

# Add count labels on bars
for bar, count in zip(bars, counts):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{count:,}',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

# Query provisions by jurisdiction and authority
query = f"""
SELECT * FROM cypher('{GRAPH_NAME}', $$
    MATCH (p:Provision)
    RETURN p.jurisdiction, p.authority, COUNT(p) as prov_cnt
$$) as (jurisdiction agtype, authority agtype, prov_cnt agtype);
"""

juris_df = execute_cypher(query)
juris_df.columns = ['Jurisdiction', 'Authority', 'Count']
juris_df = juris_df.sort_values('Count', ascending=False)

print("🌍 TOP AUTHORITIES BY PROVISION COUNT")
print("="*70)
print(juris_df.head(10).to_string(index=False))

# Calculate coverage
known_count = juris_df[juris_df['Authority'] != 'UNKNOWN']['Count'].sum()
coverage = (known_count / stats['provisions']) * 100

print(f"\n📊 Metadata Coverage: {coverage:.1f}% ({known_count:,} / {stats['provisions']:,})")

🌍 TOP AUTHORITIES BY PROVISION COUNT
======================================================================
Jurisdiction                        Authority  Count
          CA                             CFIA  41983
          CA            Department of Justice   4668
          CA                    Health Canada   3031
          CA        World Health Organization   1031
          CA Canadian General Standards Board    684

📊 Metadata Coverage: 100.0% (51,397 / 51,397)

# Visualize top authorities
top_10 = juris_df.head(10)

fig, ax = plt.subplots(1, 1, figsize=(12, 8))

# Create readable labels
labels = [f"{row['Jurisdiction']} - {row['Authority'][:40]}" for _, row in top_10.iterrows()]

bars = ax.barh(range(len(top_10)), top_10['Count'], color='steelblue', alpha=0.7, edgecolor='black')
ax.set_yticks(range(len(top_10)))
ax.set_yticklabels(labels)
ax.set_xlabel('Number of Provisions', fontsize=12, fontweight='bold')
ax.set_title('Top 10 Regulatory Authorities', fontsize=14, fontweight='bold', pad=15)
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)

# Add count labels
for i, (bar, count) in enumerate(zip(bars, top_10['Count'])):
    ax.text(count, i, f' {count:,}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

# Get requirement types
query = f"""
SELECT * FROM cypher('{GRAPH_NAME}', $$
    MATCH (r:Requirement)
    RETURN r.requirement_type, COUNT(r)
$$) as (type agtype, count agtype);
"""

req_types_df = execute_cypher(query)
req_types_df.columns = ['Requirement Type', 'Count']
req_types_df = req_types_df.sort_values('Count', ascending=False)
req_types_df['Percentage'] = (req_types_df['Count'] / req_types_df['Count'].sum() * 100).round(2)

print("📋 TOP 15 REQUIREMENT TYPES")
print("="*70)
print(req_types_df.head(15)[['Requirement Type', 'Count', 'Percentage']].to_string(index=False))

print(f"\nTotal unique requirement types: {len(req_types_df)}")
print(f"Top 5 coverage: {req_types_df.head(5)['Percentage'].sum():.1f}%")

📋 TOP 15 REQUIREMENT TYPES
======================================================================
Requirement Type  Count  Percentage
         process  34057       25.98
   documentation  32771       25.00
     composition  21800       16.63
       labelling  18479       14.10
     prohibition  11846        9.04
         testing   7032        5.37
           claim   3912        2.98
        training    244        0.19
        sampling    136        0.10
       packaging    118        0.09
      compliance     48        0.04
      definition     39        0.03
         storage     34        0.03
    notification     23        0.02
          design     21        0.02

Total unique requirement types: 144
Top 5 coverage: 90.8%

# Visualize top requirement types
top_15 = req_types_df.head(15)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart
bars = ax1.barh(range(len(top_15)), top_15['Count'], color='coral', alpha=0.7, edgecolor='black')
ax1.set_yticks(range(len(top_15)))
ax1.set_yticklabels(top_15['Requirement Type'])
ax1.set_xlabel('Count', fontsize=12, fontweight='bold')
ax1.set_title('Top 15 Requirement Types', fontsize=13, fontweight='bold')
ax1.invert_yaxis()
ax1.grid(axis='x', alpha=0.3)

for i, (bar, count, pct) in enumerate(zip(bars, top_15['Count'], top_15['Percentage'])):
    ax1.text(count, i, f' {count:,} ({pct}%)', va='center', fontsize=9)

# Pie chart for top 5 + Other
top_5 = req_types_df.head(5)
other_count = req_types_df.iloc[5:]['Count'].sum()

pie_data = list(top_5['Count']) + [other_count]
pie_labels = list(top_5['Requirement Type']) + ['Other']
colors = plt.cm.Set3(range(len(pie_data)))

ax2.pie(pie_data, labels=pie_labels, autopct='%1.1f%%', startangle=90, colors=colors)
ax2.set_title('Requirement Type Distribution (Top 5 + Other)', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.show()

# Get deontic modalities
query = f"""
SELECT * FROM cypher('{GRAPH_NAME}', $$
    MATCH (r:Requirement)
    RETURN r.deontic_modality, COUNT(r)
$$) as (modality agtype, count agtype);
"""

modality_df = execute_cypher(query)
modality_df.columns = ['Modality', 'Count']
modality_df = modality_df.sort_values('Count', ascending=False)
modality_df['Percentage'] = (modality_df['Count'] / modality_df['Count'].sum() * 100).round(2)

print("⚖️  DEONTIC MODALITY DISTRIBUTION")
print("="*70)
print(modality_df.head(16).to_string(index=False))

# Calculate aggregated categories
mandatory = modality_df[modality_df['Modality'].str.contains('must|shall', case=False, na=False)]['Count'].sum()
optional = modality_df[modality_df['Modality'].str.contains('may|can', case=False, na=False)]['Count'].sum()
recommended = modality_df[modality_df['Modality'].str.contains('should', case=False, na=False)]['Count'].sum()

total_reqs = modality_df['Count'].sum()

print(f"\n📊 Aggregated Legal Force:")
print(f"  • Mandatory (must/shall):     {mandatory:>8,} ({mandatory/total_reqs*100:.1f}%)")
print(f"  • Optional (may/can):         {optional:>8,} ({optional/total_reqs*100:.1f}%)")
print(f"  • Recommended (should):       {recommended:>8,} ({recommended/total_reqs*100:.1f}%)")

⚖️  DEONTIC MODALITY DISTRIBUTION
======================================================================
   Modality  Count  Percentage
       must 100755       76.87
   must_not  17928       13.68
        may   6517        4.97
     should   5583        4.26
 should_not    195        0.15
      shall     55        0.04
    may_not     11        0.01
       will      6        0.00
 acceptable      4        0.00
   expected      4        0.00
         is      3        0.00
recommended      3        0.00
        can      2        0.00
      could      1        0.00
  recommend      1        0.00
  shall_not      1        0.00

📊 Aggregated Legal Force:
  • Mandatory (must/shall):      118,739 (90.6%)
  • Optional (may/can):            6,530 (5.0%)
  • Recommended (should):          5,778 (4.4%)

# Visualize modality distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Top modalities bar chart
top_modalities = modality_df.head(8)

# Color by category
colors = ['#e74c3c' if 'not' in str(mod).lower() else 
          '#3498db' if 'must' in str(mod).lower() else 
          '#2ecc71' if 'may' in str(mod).lower() else
          '#f39c12' for mod in top_modalities['Modality']]

bars = ax1.bar(range(len(top_modalities)), top_modalities['Count'], 
               color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
ax1.set_xticks(range(len(top_modalities)))
ax1.set_xticklabels(top_modalities['Modality'], rotation=45, ha='right')
ax1.set_ylabel('Count', fontsize=12, fontweight='bold')
ax1.set_title('Top Deontic Modalities', fontsize=13, fontweight='bold')
ax1.grid(axis='y', alpha=0.3)

for bar, count in zip(bars, top_modalities['Count']):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
            f'{count:,}', ha='center', va='bottom', fontsize=9)

# Pie chart: Aggregated legal force
other = total_reqs - mandatory - optional - recommended

pie_data = [mandatory, optional, recommended, other]
pie_labels = ['Mandatory\n(must)', 'Optional\n(may/can)', 'Recommended\n(should)', 'Other']
pie_colors = ['#e74c3c', '#3498db', '#f39c12', '#95a5a6']

ax2.pie(pie_data, labels=pie_labels, autopct='%1.1f%%', startangle=90, colors=pie_colors)
ax2.set_title('Legal Force Distribution', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.show()

# Get constraint types
query = f"""
SELECT * FROM cypher('{GRAPH_NAME}', $$
    MATCH (c:Constraint)
    RETURN c.logic_type, COUNT(c)
$$) as (type agtype, count agtype);
"""

constraint_df = execute_cypher(query)
constraint_df.columns = ['Logic Type', 'Count']
constraint_df = constraint_df.sort_values('Count', ascending=False)
constraint_df['Percentage'] = (constraint_df['Count'] / constraint_df['Count'].sum() * 100).round(2)

print("🔧 CONSTRAINT LOGIC TYPES")
print("="*70)
print(constraint_df.to_string(index=False))

🔧 CONSTRAINT LOGIC TYPES
======================================================================
 Logic Type  Count  Percentage
    boolean  84569       63.97
  threshold  37422       28.31
    pattern   5747        4.35
enumeration   4464        3.38

# Visualize constraint types
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

colors = ['#9b59b6', '#e67e22', '#1abc9c', '#34495e']

# Bar chart
bars = ax1.bar(constraint_df['Logic Type'], constraint_df['Count'], 
               color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
ax1.set_ylabel('Count', fontsize=12, fontweight='bold')
ax1.set_title('Constraint Types Distribution', fontsize=13, fontweight='bold')
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', alpha=0.3)

for bar, count, pct in zip(bars, constraint_df['Count'], constraint_df['Percentage']):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
            f'{count:,}\n({pct}%)',
            ha='center', va='bottom', fontsize=9)

# Pie chart
ax2.pie(constraint_df['Count'], labels=constraint_df['Logic Type'], 
        autopct='%1.1f%%', startangle=90, colors=colors)
ax2.set_title('Constraint Type Proportions', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.show()

# Sample random provisions
query = f"""
SELECT * FROM cypher('{GRAPH_NAME}', $$
    MATCH (p:Provision)
    RETURN p.provision_id, p.authority, p.jurisdiction, p.text
    LIMIT 5000
$$) as (id agtype, authority agtype, jurisdiction agtype, text agtype);
"""

provisions_sample = execute_cypher(query)
provisions_sample.columns = ['Provision ID', 'Authority', 'Jurisdiction', 'Text']

# Select 5 random samples
random_provisions = provisions_sample.sample(n=min(5, len(provisions_sample)))

print("📄 RANDOM PROVISION SAMPLES")
print("="*80)

for i, (idx, row) in enumerate(random_provisions.iterrows(), 1):
    print(f"\n[{i}] Provision ID: {row['Provision ID']}")
    print(f"    Authority: {row['Authority']}")
    print(f"    Jurisdiction: {row['Jurisdiction']}")
    text_preview = row['Text'][:250] if len(row['Text']) > 250 else row['Text']
    print(f"    Text: {text_preview}...")
    print("-" * 80)

📄 RANDOM PROVISION SAMPLES
================================================================================

[1] Provision ID: UNK_GEN_NODOC_2545
    Authority: CFIA
    Jurisdiction: CA
    Text: Questions and answers Q1. What is the issue? Some beekeepers have been using veterinary drugs in an extra-label manner (using drug products for purposes other than indicated on the label) to control certain honey bee diseases for which either there i...
--------------------------------------------------------------------------------

[2] Provision ID: 1.6.3
    Authority: CFIA
    Jurisdiction: CA
    Text: days which are added to the drying period because of a sausage formulation which has a reduced salt content (see section A.1.6.3). = Minimum number of drying days...
--------------------------------------------------------------------------------

[3] Provision ID: FALLBACK_c1bbfcda
    Authority: CFIA
    Jurisdiction: CA
    Text: Table: Colour classes for prepackaged honey other than consumer prepackaged honey Item Colour class Designation on honey classifier Darker than Designation on honey classifier Not darker than Reading on Pfund honey grader More than Reading on Pfund h...
--------------------------------------------------------------------------------

[4] Provision ID: UNK_CONT_NODOC_502
    Authority: CFIA
    Jurisdiction: CA
    Text: ltry carcass when the selection is done after chilling, at the end of the drip line or last readily accessible point prior to packaging (e.g., carcasses which are subjected to immersion freezing) - see noteFootnote 3: that the selection of the chille...
--------------------------------------------------------------------------------

[5] Provision ID: 5.2.2
    Authority: CFIA
    Jurisdiction: CA
    Text: a) adjacent areas; b) equipment used for both organic and non-organic crops. If unintended contact with prohibited substances is possible, distinct buffer zones or other features sufficient 5.2.2 to prevent contamination are required:...
--------------------------------------------------------------------------------

# Query requirement-constraint pairs
query = f"""
SELECT
  req_type,
  modality,
  description,
  const_type,
  signal,
  (c_props -> '"operator"'::agtype)  AS op,
  (c_props -> '"threshold"'::agtype) AS thresh
FROM cypher('{GRAPH_NAME}', $$
  MATCH (r:Requirement)-[:HAS_CONSTRAINT]->(c:Constraint)
  RETURN
    r.requirement_type,
    r.deontic_modality,
    r.description,
    c.logic_type,
    c.target_signal,
    properties(c) AS c_props
  LIMIT 3000
$$) AS (
  req_type agtype,
  modality agtype,
  description agtype,
  const_type agtype,
  signal agtype,
  c_props agtype
);
"""

req_const_sample = execute_cypher(query)
req_const_sample.columns = [
    'Req Type', 'Modality', 'Description', 
    'Constraint Type', 'Target Signal', 'Operator', 'Threshold'
]

# Sample 5 random pairs
random_pairs = req_const_sample.sample(n=min(5, len(req_const_sample)))

print("🔗 RANDOM REQUIREMENT → CONSTRAINT PAIRS")
print("="*80)

for i, (idx, row) in enumerate(random_pairs.iterrows(), 1):
    print(f"\n[{i}] Requirement Type: {row['Req Type']}")
    print(f"    Modality: {row['Modality']}")
    desc_preview = row['Description'][:200] if len(str(row['Description'])) > 200 else row['Description']
    print(f"    Description: {desc_preview}...")
    print(f"    ↓ Constraint")
    print(f"    Logic Type: {row['Constraint Type']}")
    print(f"    Target: {row['Target Signal']}")
    
    if row['Operator'] and str(row['Operator']) not in ['nan', 'None']:
        print(f"    Operator: {row['Operator']}")
        if row['Threshold'] and str(row['Threshold']) not in ['nan', 'None']:
            print(f"    Threshold: {row['Threshold']}")
    
    print("-" * 80)

🔗 RANDOM REQUIREMENT → CONSTRAINT PAIRS
================================================================================

[1] Requirement Type: documentation
    Modality: must
    Description: CFIA reserves the right to carry out audits of those involved in the IREP....
    ↓ Constraint
    Logic Type: boolean
    Target: audit.participation
--------------------------------------------------------------------------------

[2] Requirement Type: testing
    Modality: must
    Description: Salmonella must be absent in 25 g....
    ↓ Constraint
    Logic Type: boolean
    Target: product.Salmonella
--------------------------------------------------------------------------------

[3] Requirement Type: documentation
    Modality: must
    Description: A Certificate of Inspection covering meat products must be provided....
    ↓ Constraint
    Logic Type: boolean
    Target: documentation.certification
--------------------------------------------------------------------------------

[4] Requirement Type: documentation
    Modality: must
    Description: The Veterinary health certificate must indicate that the product is 'Moisture-Infused Pork'....
    ↓ Constraint
    Logic Type: pattern
    Target: certificate.text
--------------------------------------------------------------------------------

[5] Requirement Type: documentation
    Modality: must
    Description: Document the name of each restricted drug lost and, if applicable, the brand name of the product or the name of the compound that contained it....
    ↓ Constraint
    Logic Type: pattern
    Target: documentation.drug_name
--------------------------------------------------------------------------------

# Get requirement descriptions
query = f"""
SELECT * FROM cypher('{GRAPH_NAME}', $$
    MATCH (r:Requirement)
    RETURN r.description
    LIMIT 10000
$$) as (description agtype);
"""

req_desc = execute_cypher(query)

# Combine all descriptions
all_text = ' '.join([str(desc) for desc in req_desc['description'] 
                     if desc and str(desc) not in ['nan', 'None']])

# Define stop words (regulatory + common)
stopwords = set([
    'must', 'shall', 'may', 'should', 'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that',
    'is', 'for', 'with', 'as', 'or', 'are', 'on', 'by', 'from', 'have', 'has', 'at',
    'this', 'which', 'not', 'been', 'an', 'it', 'will', 'their', 'all', 'can', 'any',
    'each', 'if', 'include', 'including', 'contain', 'contains', 'set'
])

# Generate word cloud
wordcloud = WordCloud(
    width=1200, 
    height=600, 
    background_color='white',
    stopwords=stopwords, 
    colormap='viridis',
    max_words=100,
    relative_scaling=0.5,
    min_font_size=10
).generate(all_text)

plt.figure(figsize=(14, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Common Terms in Requirements', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\n💬 Word cloud generated from 10,000 requirement descriptions")
print("   Stop words removed: modal verbs (must, shall, may, should) and common terms")

💬 Word cloud generated from 10,000 requirement descriptions
   Stop words removed: modal verbs (must, shall, may, should) and common terms

# Analyze connectivity
queries_connectivity = [
    (
        "Provisions with Requirements",
        f"""
        SELECT (cnt::text)::bigint AS cnt
        FROM cypher('{GRAPH_NAME}', $$
            MATCH (p:Provision)<-[:DERIVED_FROM]-(r:Requirement)
            RETURN COUNT(DISTINCT p) AS cnt
        $$) AS (cnt agtype);
        """
    ),
    (
        "Requirements with Constraints",
        f"""
        SELECT (cnt::text)::bigint AS cnt
        FROM cypher('{GRAPH_NAME}', $$
            MATCH (r:Requirement)-[:HAS_CONSTRAINT]->(c:Constraint)
            RETURN COUNT(DISTINCT r) AS cnt
        $$) AS (cnt agtype);
        """
    ),
    (
        "Isolated Provisions (no requirements)",
        f"""
        SELECT (cnt::text)::bigint AS cnt
        FROM cypher('{GRAPH_NAME}', $$
            MATCH (p:Provision)
            WHERE NOT EXISTS( (p)<-[:DERIVED_FROM]-() )
            RETURN COUNT(p) AS cnt
        $$) AS (cnt agtype);
        """
    )
]

connectivity_data = []
for name, query in queries_connectivity:
    result = execute_cypher(query)
    count = int(result.iloc[0, 0])
    connectivity_data.append({"Metric": name, "Count": count})

connectivity_df = pd.DataFrame(connectivity_data)

# Extract specific metrics
provs_with_reqs = connectivity_df.loc[connectivity_df['Metric'] == 'Provisions with Requirements', 'Count'].values[0]
reqs_with_consts = connectivity_df.loc[connectivity_df['Metric'] == 'Requirements with Constraints', 'Count'].values[0]
isolated_provs = connectivity_df.loc[connectivity_df['Metric'].str.contains('Isolated'), 'Count'].values[0]

print("🔗 GRAPH CONNECTIVITY METRICS")
print("="*70)
print(f"Provisions with requirements: {provs_with_reqs:,} / {stats['provisions']:,} "
      f"({provs_with_reqs / stats['provisions'] * 100:.1f}%)")
print(f"Requirements with constraints: {reqs_with_consts:,} / {stats['requirements']:,} "
      f"({reqs_with_consts / stats['requirements'] * 100:.1f}%)")
print(f"\nIsolated provisions (orphaned): {isolated_provs:,}")

if isolated_provs > 0:
    print(f"⚠️  Warning: {isolated_provs} provisions have no requirements")
else:
    print("✅ Perfect connectivity: All provisions have requirements")

# Display DataFrame
display(connectivity_df)

🔗 GRAPH CONNECTIVITY METRICS
======================================================================
Provisions with requirements: 51,397 / 51,397 (100.0%)
Requirements with constraints: 116,906 / 131,069 (89.2%)

Isolated provisions (orphaned): 0
✅ Perfect connectivity: All provisions have requirements

# Visualize connectivity
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

categories = ['Provisions\nwith Reqs', 'Requirements\nwith Constraints']
connected = [provs_with_reqs, reqs_with_consts]
total = [stats['provisions'], stats['requirements']]
percentages = [
    provs_with_reqs / stats['provisions'] * 100,
    reqs_with_consts / stats['requirements'] * 100
]

x = np.arange(len(categories))
width = 0.35

bars1 = ax.bar(x - width/2, connected, width, label='Connected', 
               color='#2ecc71', alpha=0.8, edgecolor='black')
bars2 = ax.bar(x + width/2, [t - c for t, c in zip(total, connected)], width,
               label='Not Connected', color='#e74c3c', alpha=0.8, edgecolor='black')

ax.set_ylabel('Count', fontsize=12, fontweight='bold')
ax.set_title('Graph Connectivity Analysis', fontsize=14, fontweight='bold', pad=15)
ax.set_xticks(x)
ax.set_xticklabels(categories)
ax.legend(loc='upper right')
ax.grid(axis='y', alpha=0.3)

# Add percentage labels on connected bars
for i, (bar, pct) in enumerate(zip(bars1, percentages)):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height/2,
            f'{pct:.1f}%', ha='center', va='center', 
            fontsize=11, fontweight='bold', color='white')

plt.tight_layout()
plt.show()

# Count requirements per provision
query = f"""
SELECT * FROM cypher('{GRAPH_NAME}', $$
    MATCH (p:Provision)<-[:DERIVED_FROM]-(r:Requirement)
    WITH p, COUNT(r) as req_count
    RETURN req_count, COUNT(p)
$$) as (req_count agtype, prov_count agtype);
"""

req_depth = execute_cypher(query)
req_depth.columns = ['Requirements per Provision', 'Provision Count']
req_depth = req_depth.sort_values('Requirements per Provision')

print("📊 REQUIREMENTS PER PROVISION DISTRIBUTION")
print("="*70)
print(req_depth.head(20).to_string(index=False))

# Calculate statistics
total_provisions = req_depth['Provision Count'].sum()
mean_reqs = (req_depth['Requirements per Provision'] * req_depth['Provision Count']).sum() / total_provisions
median_bin = req_depth.iloc[len(req_depth)//2]['Requirements per Provision']

print(f"\n📈 Statistics:")
print(f"  Mean requirements per provision: {mean_reqs:.2f}")
print(f"  Median bin: {median_bin}")
print(f"  Max requirements from single provision: {req_depth['Requirements per Provision'].max()}")

📊 REQUIREMENTS PER PROVISION DISTRIBUTION
======================================================================
 Requirements per Provision  Provision Count
                          1             8580
                          2            11303
                          3             8478
                          4             6003
                          5             1690
                          6             5903
                          7              696
                          8             1247
                          9             2161
                         10              997
                         11               97
                         12             1506
                         13              142
                         14              181
                         15              770
                         16              143
                         17               27
                         18              493
                         19               42
                         20              126

📈 Statistics:
  Mean requirements per provision: 4.70
  Median bin: 28
  Max requirements from single provision: 84

# Visualize distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Line chart - detailed distribution
ax1.plot(req_depth['Requirements per Provision'], req_depth['Provision Count'],
         marker='o', linewidth=2, markersize=4, color='#3498db', alpha=0.7)
ax1.fill_between(req_depth['Requirements per Provision'], req_depth['Provision Count'], 
                 alpha=0.3, color='#3498db')
ax1.set_xlabel('Number of Requirements per Provision', fontsize=11, fontweight='bold')
ax1.set_ylabel('Number of Provisions', fontsize=11, fontweight='bold')
ax1.set_title('Requirements per Provision Distribution', fontsize=13, fontweight='bold')
ax1.grid(True, alpha=0.3)

# Bar chart - binned distribution
bins = [0, 2, 5, 10, 15, 20, 50]
bin_labels = ['1-2', '3-5', '6-10', '11-15', '16-20', '20+']
req_depth['Bin'] = pd.cut(req_depth['Requirements per Provision'], 
                           bins=bins, labels=bin_labels, include_lowest=True)
binned = req_depth.groupby('Bin', observed=True)['Provision Count'].sum().reset_index()

bars = ax2.bar(binned['Bin'].astype(str), binned['Provision Count'],
               color='#9b59b6', alpha=0.7, edgecolor='black', linewidth=1.5)
ax2.set_xlabel('Requirements per Provision (Binned)', fontsize=11, fontweight='bold')
ax2.set_ylabel('Number of Provisions', fontsize=11, fontweight='bold')
ax2.set_title('Binned Requirements Distribution', fontsize=13, fontweight='bold')
ax2.grid(axis='y', alpha=0.3)

for bar, count in zip(bars, binned['Provision Count']):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height,
            f'{count:,}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

# Create comprehensive summary
summary = f"""
{'='*80}
REGULATORY KNOWLEDGE GRAPH - SUMMARY REPORT
{'='*80}

📊 GRAPH SIZE
  • Total Provisions:        {stats['provisions']:>10,}
  • Total Requirements:      {stats['requirements']:>10,}
  • Total Constraints:       {stats['constraints']:>10,}
  • Total Relationships:     {stats['relationships']:>10,}
  • Graph Density:           {density:.6f} ({density*100:.4f}%)

🌍 COVERAGE
  • Primary Authority:       CFIA ({juris_df[juris_df['Authority'] == 'CFIA']['Count'].values[0]:,} provisions)
  • Total Authorities:       {len(juris_df)}
  • Primary Jurisdiction:    Canada (CA)
  • Metadata Coverage:       {coverage:.1f}%

📋 REQUIREMENT BREAKDOWN
  • Top Type:                {req_types_df.iloc[0]['Requirement Type']} ({req_types_df.iloc[0]['Count']:,})
  • Unique Types:            {len(req_types_df)}
  • Top 5 Coverage:          {req_types_df.head(5)['Percentage'].sum():.1f}%
  • Mandatory (must):        {mandatory:,} ({mandatory/stats['requirements']*100:.1f}%)
  • Optional (may):          {optional:,} ({optional/stats['requirements']*100:.1f}%)
  • Recommended (should):    {recommended:,} ({recommended/stats['requirements']*100:.1f}%)

🔧 CONSTRAINT TYPES
  • Boolean:                 {constraint_df[constraint_df['Logic Type'] == 'boolean']['Count'].values[0]:,} ({constraint_df[constraint_df['Logic Type'] == 'boolean']['Percentage'].values[0]:.1f}%)
  • Threshold:               {constraint_df[constraint_df['Logic Type'] == 'threshold']['Count'].values[0]:,} ({constraint_df[constraint_df['Logic Type'] == 'threshold']['Percentage'].values[0]:.1f}%)
  • Pattern:                 {constraint_df[constraint_df['Logic Type'] == 'pattern']['Count'].values[0]:,} ({constraint_df[constraint_df['Logic Type'] == 'pattern']['Percentage'].values[0]:.1f}%)
  • Enumeration:             {constraint_df[constraint_df['Logic Type'] == 'enumeration']['Count'].values[0]:,} ({constraint_df[constraint_df['Logic Type'] == 'enumeration']['Percentage'].values[0]:.1f}%)

📈 CONNECTIVITY
  • Connected Provisions:    {provs_with_reqs/stats['provisions']*100:.1f}%
  • Connected Requirements:  {reqs_with_consts/stats['requirements']*100:.1f}%
  • Isolated Provisions:     {isolated_provs:,}
  • Avg Reqs/Provision:      {mean_reqs:.2f}
  • Avg Constraints/Req:     {stats['constraints']/stats['requirements']:.2f}

✅ DATA QUALITY
  • Source Fragments:        ~82,000
  • Graph Completeness:      {'Perfect' if isolated_provs == 0 else 'High'} ({provs_with_reqs/stats['provisions']*100:.1f}% connectivity)

{'='*80}
STATUS: ✓ Graph is complete and ready for compliance queries
{'='*80}
"""

print(summary)

================================================================================
REGULATORY KNOWLEDGE GRAPH - SUMMARY REPORT
================================================================================

📊 GRAPH SIZE
  • Total Provisions:            51,397
  • Total Requirements:         131,069
  • Total Constraints:          132,202
  • Total Relationships:        480,654
  • Graph Density:           0.000005 (0.0005%)

🌍 COVERAGE
  • Primary Authority:       CFIA (21,683 provisions)
  • Total Authorities:       6
  • Primary Jurisdiction:    Canada (CA)
  • Metadata Coverage:       56.4%

📋 REQUIREMENT BREAKDOWN
  • Top Type:                process (34,057)
  • Unique Types:            144
  • Top 5 Coverage:          90.8%
  • Mandatory (must):        118,739 (90.6%)
  • Optional (may):          6,530 (5.0%)
  • Recommended (should):    5,778 (4.4%)

🔧 CONSTRAINT TYPES
  • Boolean:                 84,569 (64.0%)
  • Threshold:               37,422 (28.3%)
  • Pattern:                 5,747 (4.3%)
  • Enumeration:             4,464 (3.4%)

📈 CONNECTIVITY
  • Connected Provisions:    100.0%
  • Connected Requirements:  89.2%
  • Isolated Provisions:     0
  • Avg Reqs/Provision:      4.70
  • Avg Constraints/Req:     1.01

✅ DATA QUALITY
  • Source Fragments:        ~82,000
    • Graph Completeness:      Perfect (100.0% connectivity)

================================================================================
STATUS: ✓ Graph is complete and ready for compliance queries
================================================================================

Entity	Count	Description
Provisions	51,397	Legal text fragments from regulatory documents
Requirements	131,069	Parsed compliance requirements with deontic modality
Constraints	132,202	Machine-readable conditions (boolean, threshold, pattern, enum)
Relationships	480,654	Semantic connections between entities

Type	Description	Example
process	Manufacturing/handling procedures	"Must pasteurize at 72°C for 15s"
documentation	Record-keeping obligations	"Maintain logs for 2 years"
composition	Ingredient/nutrient specifications	"Sodium ≤ 140mg per serving"
labelling	Label content and format rules	"Must display bilingual net quantity"
prohibition	Forbidden actions/substances	"Must not contain lead > 0.1 ppm"
testing	Analytical testing requirements	"Test for Listeria monthly"
claim	Marketing claim restrictions	"'Low sodium' requires ≤ 140mg"

Modality	Symbol	Meaning	Legal Effect
must	O(φ)	Obligation	Mandatory - failure = non-compliance
must_not	O(¬φ)	Prohibition	Forbidden - violation = non-compliance
may	P(φ)	Permission	Optional - allowed but not required
should	R(φ)	Recommendation	Best practice - non-binding
shall	O(φ)	Obligation (formal)	Legal synonym for "must"

Logic Type	Description	Example	Formal Representation
boolean	Binary condition	"Product must be pasteurized"	`pasteurized = True`
threshold	Numeric comparison	"Sodium ≤ 140mg"	`sodium ≤ 140`
pattern	Regex/format match	"Ingredients in descending order"	`match(/^[A-Z].*/)`
enumeration	Allowed values	"Category ∈ {A, B, C}"	`category ∈ {'A', 'B', 'C'}`

Canada F&B Regulatory Compliance Knowledge Graph Analysis¶

🎯 Overview¶

What's Inside the Graph?¶

🔬 What You'll Learn¶

📚 Table of Contents¶

⚠️ Security Notice¶

1. Environment Setup¶

Prerequisites¶

Database Configuration¶

2. Graph Overview Statistics¶

Entity Counts¶

Visualization: Entity Distribution¶

3. Jurisdiction & Authority Analysis¶

Regulatory Sources¶

4. Requirement Type Analysis¶

Requirement Taxonomy¶

5. Deontic Modality Analysis¶

Legal Force Classification¶

6. Constraint Type Analysis¶

Logic Types¶

7. Random Sampling - Provisions¶

8. Random Sampling - Requirements with Constraints¶

9. Text Analysis - Word Cloud¶

Most Common Terms in Requirements¶

10. Graph Connectivity Analysis¶

Measuring Graph Completeness¶

11. Requirements Depth Distribution¶

How many requirements are derived from each provision?¶

12. Summary Statistics¶

Comprehensive Summary Report¶

	Metric	Count
0	Provisions with Requirements	51397
1	Requirements with Constraints	116906
2	Isolated Provisions (no requirements)	0