Hasan-Atris3 commited on
Commit
6f8b70f
·
1 Parent(s): 793f5d8

my first commit

Browse files
Files changed (7) hide show
  1. .gitignore +72 -0
  2. app.py +263 -0
  3. chainlit.md +14 -0
  4. db.py +41 -0
  5. main_final.py +318 -0
  6. srd_engine_final.py +359 -0
  7. srd_engine_v2.py +463 -0
.gitignore ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ ENV/
26
+ env/
27
+ .venv
28
+
29
+ # Environment variables
30
+ .env
31
+ .env.local
32
+ .env.*.local
33
+
34
+ # IDE
35
+ .vscode/
36
+ .idea/
37
+ *.swp
38
+ *.swo
39
+ *~
40
+ .DS_Store
41
+
42
+ # Chainlit
43
+ .chainlit/
44
+
45
+ # ChromaDB
46
+ chroma_db*/
47
+ chroma_*/
48
+
49
+ # Database files
50
+ *.db
51
+ *.sqlite
52
+ *.sqlite3
53
+ cedropass.db
54
+
55
+ # Data files (uncomment if you don't want to track these)
56
+ # *.csv
57
+ # *.xlsx
58
+ # *.pdf
59
+ # *.jpg
60
+ # *.png
61
+
62
+ # Logs
63
+ *.log
64
+ logs/
65
+
66
+ # Temporary files
67
+ .files/
68
+ *.tmp
69
+ *.temp
70
+
71
+ # Results and outputs
72
+ results.txt
app.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import time
3
+ import chainlit as cl
4
+
5
+ from srd_engine_v2 import SmartKnowledgeBase, ClaudeAnswerer
6
+ from db import SessionLocal, User, Chat, Message
7
+
8
+ claude = ClaudeAnswerer()
9
+
10
+
11
@cl.on_chat_start
async def start():
    """Session bootstrap for a new Chainlit connection.

    Identifies (or creates) the anonymous user, lets them pick an existing
    chat or create a new one, wires a SmartKnowledgeBase engine into the
    session, and runs document ingestion for new chats or replays history
    for resumed ones. One DB session spans the handler and is always closed.
    """
    session = cl.user_session
    db = SessionLocal()
    try:
        # -----------------------
        # USER IDENTIFICATION
        # -----------------------
        # No auth: create an anonymous User row once per browser session and
        # cache its id in the Chainlit session.
        if not session.get("user_id"):
            user = User()
            db.add(user)
            db.commit()
            session.set("user_id", user.id)

        user_id = session.get("user_id")

        # -----------------------
        # CHAT SELECTION
        # -----------------------
        chats = db.query(Chat).filter(Chat.user_id == user_id).all()

        # Offer "new chat" plus at most the 5 most recent existing chats.
        actions = [cl.Action(name="new_chat", payload={}, label="➕ New Project Chat")]
        for c in chats[-5:]:
            actions.append(
                cl.Action(
                    name="resume_chat",
                    payload={"chat_id": c.id},
                    label=f"📂 {c.project_name}"
                )
            )

        res = await cl.AskActionMessage(
            content="Choose a chat or start a new one:",
            actions=actions
        ).send()

        # User timed out / dismissed the prompt: leave the session unconfigured.
        if not res:
            return

        # -----------------------
        # NEW CHAT
        # -----------------------
        if res["name"] == "new_chat":
            project_res = await cl.AskUserMessage(
                content="Enter **Project Name**:",
                timeout=300
            ).send()
            if not project_res:
                return
            project_name = project_res["output"]

            learn_res = await cl.AskActionMessage(
                content="Allow this chat to be saved/learned for improving the bot?",
                actions=[
                    cl.Action(name="learn_yes", payload={"v": True}, label="✅ Yes (Enable Learning)"),
                    cl.Action(name="learn_no", payload={"v": False}, label="❌ No (Do Not Learn)"),
                ],
            ).send()
            # Default to learning-enabled when the prompt is dismissed.
            learning_enabled = bool(learn_res["payload"]["v"]) if learn_res else True

            chat = Chat(user_id=user_id, project_name=project_name, learning_enabled=learning_enabled)
            db.add(chat)
            db.commit()

            session.set("chat_id", chat.id)
            session.set("learning_enabled", chat.learning_enabled)

            # One shared Chroma directory; the engine scopes queries by
            # project / chat / user set below.
            engine = SmartKnowledgeBase(chroma_dir="chroma_global_db")
            engine.set_current_project(project_name)
            engine.set_current_chat(chat.id)  # ✅ NEW
            engine.set_current_user(user_id)  # ✅ NEW
            session.set("engine", engine)

            await run_ingestion(engine)

        # -----------------------
        # RESUME CHAT
        # -----------------------
        else:
            chat_id = res["payload"]["chat_id"]
            # NOTE(review): Query.get() is the legacy SQLAlchemy API;
            # 2.0 style would be db.get(Chat, chat_id) — confirm version.
            chat = db.query(Chat).get(chat_id)
            if not chat:
                await cl.Message(content="⚠️ Chat not found.").send()
                return

            session.set("chat_id", chat.id)
            session.set("learning_enabled", chat.learning_enabled)

            engine = SmartKnowledgeBase(chroma_dir="chroma_global_db")
            engine.set_current_project(chat.project_name)
            engine.set_current_chat(chat.id)  # ✅ NEW
            engine.set_current_user(user_id)  # ✅ NEW
            session.set("engine", engine)

            # Restore history: replay stored messages in chronological order.
            messages = (
                db.query(Message)
                .filter(Message.chat_id == chat_id)
                .order_by(Message.created_at)
                .all()
            )
            for m in messages:
                await cl.Message(content=m.content, author=m.role).send()

    finally:
        db.close()
117
+
118
+
119
async def run_ingestion(engine: SmartKnowledgeBase):
    """Interactive ingestion flow for a new chat.

    Asks for the SRD PDF, a diagram-vision mode (Qwen / Claude / both /
    none), then loops letting the user upload titled diagrams. Progress is
    appended to a single status message updated in place. Returns silently
    if the user cancels any prompt.
    """
    files = await cl.AskFileMessage(
        content="Upload the **SRD PDF**:",
        accept=["application/pdf"],
        max_size_mb=50,
        timeout=600
    ).send()
    if not files:
        # Cancelled / timed out — nothing to ingest.
        return

    srd_file_path = files[0].path

    res = await cl.AskActionMessage(
        content="Select Diagram Vision Mode:",
        actions=[
            cl.Action(name="qwen", payload={"v": "qwen"}, label="Qwen"),
            cl.Action(name="claude", payload={"v": "claude"}, label="Claude"),
            cl.Action(name="both", payload={"v": "both"}, label="Both"),
            cl.Action(name="none", payload={"v": "none"}, label="None"),
        ]
    ).send()

    # Dismissed prompt falls back to OCR-only ("none").
    mode = res["payload"]["v"] if res else "none"
    use_qwen = mode in ("qwen", "both")
    use_claude = mode in ("claude", "both")

    status = cl.Message(content="🚀 Starting ingestion...")
    await status.send()

    # Blocking CPU/IO-heavy work — run off the event loop thread.
    await cl.make_async(engine.process_document_step)(
        srd_file_path, "pdf_text", "SRD Main", False, False
    )

    status.content += "\n✅ SRD indexed"
    await status.update()

    # Diagram upload loop: one titled diagram per iteration until "Done"
    # or any cancelled prompt.
    while True:
        add = await cl.AskActionMessage(
            content="Add a diagram?",
            actions=[
                cl.Action(name="yes", payload={}, label="➕ Add"),
                cl.Action(name="done", payload={}, label="Done"),
            ]
        ).send()

        if not add or add["name"] == "done":
            break

        title = await cl.AskUserMessage(content="Diagram title:", timeout=300).send()
        if not title:
            break

        file = await cl.AskFileMessage(
            content="Upload diagram:",
            accept=["image/png", "image/jpeg", "application/pdf"],
            max_size_mb=20,
            timeout=600
        ).send()

        if not file:
            break

        await cl.make_async(engine.process_document_step)(
            file[0].path, "diagram", title["output"], use_qwen, use_claude
        )

        status.content += f"\n🎨 Diagram '{title['output']}' indexed"
        await status.update()

    status.content += "\n🎉 Ingestion complete. Ask questions!"
    await status.update()
190
+
191
+
192
@cl.on_message
async def main(message: cl.Message):
    """Handle one user question.

    Persists the question, asks the engine (via Claude) for an answer,
    persists and displays the answer, then offers a "Correct This" action
    carrying the original question in its payload.
    """
    session = cl.user_session
    db = SessionLocal()
    try:
        engine: SmartKnowledgeBase = session.get("engine")
        chat_id = session.get("chat_id")

        # Both are set in on_chat_start; absence means a stale/broken session.
        if not engine or not chat_id:
            await cl.Message(content="⚠️ Session error. Please refresh.").send()
            return

        # Save user msg
        db.add(Message(chat_id=chat_id, role="user", content=message.content))
        db.commit()

        # Answer — blocking RAG + LLM call, run off the event loop thread.
        response = await cl.make_async(engine.generate_smart_response)(message.content, claude)

        # Save assistant msg
        db.add(Message(chat_id=chat_id, role="assistant", content=response))
        db.commit()

        await cl.Message(content=response).send()

        # Feedback: the original question rides in the action payload so the
        # correction callback can pair question + correction.
        await cl.Message(
            content="",
            actions=[
                cl.Action(
                    name="correct",
                    payload={"original": message.content},
                    label="🔧 Correct This"
                )
            ]
        ).send()

    finally:
        db.close()
231
+
232
+
233
@cl.action_callback("correct")
async def on_correct(action):
    """Collect a user correction for a previous answer.

    The correction text is always persisted as a "user_feedback" message
    (audit trail); it is fed back into the engine only when learning is
    enabled for this chat.
    """
    session = cl.user_session
    db = SessionLocal()
    try:
        engine: SmartKnowledgeBase = session.get("engine")
        chat_id = session.get("chat_id")
        learning_enabled = bool(session.get("learning_enabled", True))

        # Remove the button so the same answer can't be corrected twice.
        await action.remove()

        res = await cl.AskUserMessage(
            content="Paste the correct information:",
            timeout=600
        ).send()
        if not res:
            return

        # Always store correction text in DB (audit trail)
        db.add(Message(chat_id=chat_id, role="user_feedback", content=res["output"]))
        db.commit()

        # Learn only if allowed
        if learning_enabled:
            engine.learn_from_interaction(action.payload["original"], res["output"])
            await cl.Message(content="✅ Correction saved and learned.").send()
        else:
            await cl.Message(content="✅ Correction saved (learning disabled for this chat).").send()

    finally:
        db.close()
chainlit.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Welcome to Chainlit! 🚀🤖
2
+
3
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
4
+
5
+ ## Useful Links 🔗
6
+
7
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
8
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
9
+
10
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
11
+
12
+ ## Welcome screen
13
+
14
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
db.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # db.py
2
+ from sqlalchemy import (
3
+ create_engine, Column, String, Text, DateTime, ForeignKey, Boolean
4
+ )
5
+ from sqlalchemy.orm import declarative_base, sessionmaker
6
+ from datetime import datetime
7
+ import uuid
8
+
9
+ Base = declarative_base()
10
+ engine = create_engine("sqlite:///cedropass.db")
11
+ SessionLocal = sessionmaker(bind=engine)
12
+
13
def gen_id() -> str:
    """Produce a fresh random UUID4 string, used as a primary-key default."""
    return f"{uuid.uuid4()}"
15
+
16
+
17
class User(Base):
    """Anonymous end user; app.py creates one row per new browser session."""
    __tablename__ = "users"
    id = Column(String, primary_key=True, default=gen_id)  # UUID4 string
    # NOTE(review): datetime.utcnow is naive and deprecated since 3.12;
    # consider a timezone-aware default when migrating.
    created_at = Column(DateTime, default=datetime.utcnow)
21
+
22
+
23
class Chat(Base):
    """A project-scoped conversation owned by a User."""
    __tablename__ = "chats"
    id = Column(String, primary_key=True, default=gen_id)  # UUID4 string
    user_id = Column(String, ForeignKey("users.id"))
    project_name = Column(String)
    learning_enabled = Column(Boolean, default=True)  # ✅ opt-in learning
    created_at = Column(DateTime, default=datetime.utcnow)
30
+
31
+
32
class Message(Base):
    """One turn in a chat; `role` identifies who produced the content."""
    __tablename__ = "messages"
    id = Column(String, primary_key=True, default=gen_id)  # UUID4 string
    chat_id = Column(String, ForeignKey("chats.id"))
    role = Column(String)  # user / assistant / user_feedback
    content = Column(Text)
    created_at = Column(DateTime, default=datetime.utcnow)
39
+
40
+
41
+ Base.metadata.create_all(engine)
main_final.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # # main2.py
2
+
3
+ # import os
4
+ # import time
5
+ # from dotenv import load_dotenv
6
+
7
+ # load_dotenv()
8
+
9
+ # from srd_engine_final import SRDChatbotEngine, ClaudeAnswerer
10
+
11
+
12
+ # def yes_no(prompt: str, default: bool = False) -> bool:
13
+ # """
14
+ # Simple [y/N] helper.
15
+ # """
16
+ # raw = input(prompt).strip().lower()
17
+ # if not raw:
18
+ # return default
19
+ # return raw in ("y", "yes")
20
+
21
+
22
+ # def main():
23
+ # engine = SRDChatbotEngine()
24
+ # claude_text_llm = None # Lazy init (for final answers)
25
+
26
+ # print("\n" + "=" * 50)
27
+ # print(" ULTIMATE SRD CO-PILOT (Claude + Qwen2-VL)")
28
+ # print("=" * 50)
29
+
30
+ # while True:
31
+ # print("\n1. Index Documents")
32
+ # print("2. Ask Question")
33
+ # print("3. Exit")
34
+
35
+ # choice = input("\nChoose: ").strip()
36
+
37
+ # # ----- INDEX -----
38
+ # if choice == "1":
39
+ # pdf = input("Enter SRD PDF path: ").strip().strip('"')
40
+ # if not os.path.exists(pdf):
41
+ # print("[ERROR] SRD PDF not found.")
42
+ # continue
43
+
44
+ # gantt = (
45
+ # input("Gantt Chart path (optional): ").strip().strip('"') or None
46
+ # )
47
+ # cls = (
48
+ # input("Class Diagram path (optional): ").strip().strip('"') or None
49
+ # )
50
+ # seq = (
51
+ # input("Sequence Diagram path (optional): ").strip().strip('"')
52
+ # or None
53
+ # )
54
+
55
+ # # Vision choices
56
+ # print("\nDiagram understanding options:")
57
+
58
+ # # <<< ADDED: Ask user if they want Qwen2-VL >>>
59
+ # use_qwen_vision = yes_no(
60
+ # "Use Qwen2-VL (free, open-source vision)? (y/N): ", default=False
61
+ # )
62
+
63
+ # # EXISTING CLAUDE OPTION
64
+ # use_claude_vision = yes_no(
65
+ # "Also use Claude Vision for diagrams? (y/N): ", default=False
66
+ # )
67
+
68
+ # # User feedback
69
+ # if use_qwen_vision and use_claude_vision:
70
+ # print("→ Diagrams will be processed by BOTH Qwen2-VL (free) and Claude Vision (paid).")
71
+ # elif use_qwen_vision:
72
+ # print("→ Diagrams will be processed ONLY by Qwen2-VL (free).")
73
+ # elif use_claude_vision:
74
+ # print("→ Diagrams will be processed ONLY by Claude Vision.")
75
+ # else:
76
+ # print("→ No Vision AI selected. Using OCR only (fastest).")
77
+
78
+ # try:
79
+ # engine.build_index(
80
+ # pdf_path=pdf,
81
+ # gantt_path=gantt,
82
+ # class_path=cls,
83
+ # seq_path=seq,
84
+ # use_qwen_vision=use_qwen_vision, # <<< CHANGED FROM True
85
+ # use_claude_vision=use_claude_vision,
86
+ # )
87
+ # print("✔ Indexed successfully.")
88
+ # except Exception as e:
89
+ # print(f"[ERROR] Indexing failed: {e}")
90
+
91
+ # # ----- CHAT -----
92
+ # elif choice == "2":
93
+ # if not engine.vectorstore:
94
+ # print("Please index documents first (Option 1).")
95
+ # continue
96
+
97
+ # if claude_text_llm is None:
98
+ # try:
99
+ # claude_text_llm = ClaudeAnswerer()
100
+ # print("✔ Claude (text) initialized for final answers.")
101
+ # except Exception as e:
102
+ # print(f"[ERROR] Failed to init Claude for answers: {e}")
103
+ # print(
104
+ # "Make sure 'anthropic' is installed and ANTHROPIC_API_KEY is set."
105
+ # )
106
+ # continue
107
+
108
+ # while True:
109
+ # q = input("\n[You]: ").strip()
110
+ # if q.lower() in ("exit", "back", "quit"):
111
+ # break
112
+
113
+ # # <<< ADDED: Total question timer >>>
114
+ # total_start = time.time()
115
+
116
+ # # ----- Retrieval Timer -----
117
+ # retrieval_start = time.time()
118
+ # try:
119
+ # results = engine.hybrid_search(q, top_k=7)
120
+ # except Exception as e:
121
+ # print(f"[ERROR] Search failed: {e}")
122
+ # continue
123
+ # retrieval_time = time.time() - retrieval_start
124
+ # print(f"[Retrieved in {retrieval_time:.2f}s]")
125
+
126
+ # if not results:
127
+ # print("No matching content found in the SRD or diagrams.")
128
+ # continue
129
+
130
+ # # Debug: show where the info came from
131
+ # for r in results:
132
+ # src = r["metadata"].get("source")
133
+ # sect = r["metadata"].get("section", "N/A")
134
+ # score = r["score"]
135
+ # print(f" → {src} | section={sect} | score={score:.2f}")
136
+
137
+ # print("\n--- Claude Answer ---")
138
+
139
+ # # ----- Claude answering time -----
140
+ # claude_start = time.time()
141
+ # try:
142
+ # answer = claude_text_llm.generate_answer(q, results)
143
+ # print(answer)
144
+ # except Exception as e:
145
+ # print(f"Claude error: {e}")
146
+ # claude_time = time.time() - claude_start
147
+
148
+ # print(f"\n[Claude Answer Time: {claude_time:.2f}s]")
149
+
150
+ # total_time = time.time() - total_start
151
+ # print(f"[Total Time (Question → Final Answer): {total_time:.2f}s]")
152
+ # print("---------------------")
153
+
154
+ # # ----- EXIT -----
155
+ # elif choice == "3":
156
+ # print("Goodbye.")
157
+ # break
158
+
159
+ # else:
160
+ # print("Invalid choice. Please select 1, 2, or 3.")
161
+
162
+
163
+ # if __name__ == "__main__":
164
+ # main()
165
+ # main2.py
166
+
167
+ import os
168
+ import time
169
+ from dotenv import load_dotenv
170
+
171
+ load_dotenv()
172
+
173
+ from srd_engine_final import SRDChatbotEngine, ClaudeAnswerer
174
+
175
+
176
def yes_no(prompt: str, default: bool = False) -> bool:
    """Ask *prompt* on stdin and interpret the reply as yes/no.

    An empty reply returns *default*; otherwise only 'y' or 'yes'
    (case-insensitive, surrounding whitespace ignored) counts as True.
    """
    answer = input(prompt).strip().lower()
    return default if not answer else answer in ("y", "yes")
184
+
185
+
186
def main():
    """Interactive CLI: index SRD documents, then chat over them via Claude.

    Menu loop: (1) build the index from a PDF plus optional diagrams,
    (2) enter a Q&A sub-loop with retrieval + Claude answering and timing
    printouts, (3) exit.
    """
    engine = SRDChatbotEngine()
    claude_text_llm = None  # Lazy init (for final answers)

    print("\n" + "=" * 50)
    print(" ULTIMATE SRD CO-PILOT (Claude + Qwen2-VL)")
    print("=" * 50)

    while True:
        print("\n1. Index Documents")
        print("2. Ask Question")
        print("3. Exit")

        choice = input("\nChoose: ").strip()

        # ----- INDEX -----
        if choice == "1":
            pdf = input("Enter SRD PDF path: ").strip().strip('"')
            if not os.path.exists(pdf):
                print("[ERROR] SRD PDF not found.")
                continue

            # Optional diagram inputs; empty answers become None.
            gantt = input("Gantt Chart path (optional): ").strip().strip('"') or None
            cls = input("Class Diagram path (optional): ").strip().strip('"') or None
            seq = input("Sequence Diagram path (optional): ").strip().strip('"') or None

            print("\nDiagram understanding options:")

            use_qwen_vision = yes_no(
                "Use Qwen2-VL (free, open-source vision)? (y/N): ", default=False
            )

            use_claude_vision = yes_no(
                "Also use Claude Vision for diagrams? (y/N): ", default=False
            )

            if use_qwen_vision and use_claude_vision:
                print("→ Diagrams will be processed by BOTH Qwen2-VL (free) and Claude Vision (paid).")
            elif use_qwen_vision:
                print("→ Diagrams will be processed ONLY by Qwen2-VL (free).")
            elif use_claude_vision:
                print("→ Diagrams will be processed ONLY by Claude Vision.")
            else:
                print("→ No Vision AI selected. Using OCR only (fastest).")

            try:
                t0 = time.time()
                # NOTE(review): SRDChatbotEngine.build_index in
                # srd_engine_final.py takes (pdf_path, diagrams) — these
                # keyword args would raise TypeError; confirm which engine
                # version this CLI is meant to drive.
                engine.build_index(
                    pdf_path=pdf,
                    gantt_path=gantt,
                    class_path=cls,
                    seq_path=seq,
                    use_qwen_vision=use_qwen_vision,
                    use_claude_vision=use_claude_vision,
                )
                t1 = time.time()
                print(f"✔ Indexed successfully in {t1 - t0:.2f}s.")
            except Exception as e:
                print(f"[ERROR] Indexing failed: {e}")

        # ----- CHAT -----
        elif choice == "2":
            if not engine.vectorstore:
                print("Please index documents first (Option 1).")
                continue

            # Lazy Claude init so indexing can run without an API key.
            if claude_text_llm is None:
                try:
                    claude_text_llm = ClaudeAnswerer()
                    print("✔ Claude (text) initialized for final answers.")
                except Exception as e:
                    print(f"[ERROR] Failed to init Claude for answers: {e}")
                    print(
                        "Make sure 'anthropic' is installed and ANTHROPIC_API_KEY is set."
                    )
                    continue

            while True:
                q = input("\n[You]: ").strip()
                if q.lower() in ("exit", "back", "quit"):
                    break

                total_start = time.time()

                # Retrieval
                # NOTE(review): hybrid_search is not defined on the visible
                # SRDChatbotEngine — verify against the intended engine.
                retrieval_start = time.time()
                try:
                    results = engine.hybrid_search(q, top_k=7)
                except Exception as e:
                    print(f"[ERROR] Search failed: {e}")
                    continue
                retrieval_time = time.time() - retrieval_start
                print(f"[Retrieved in {retrieval_time:.2f}s]")

                if not results:
                    print("No matching content found in the SRD or diagrams.")
                    continue

                # Show provenance of each retrieved chunk.
                for r in results:
                    src = r["metadata"].get("source")
                    sect = r["metadata"].get("section", "N/A")
                    score = r["score"]
                    print(f" → {src} | section={sect} | score={score:.2f}")

                print("\n--- Claude Answer ---")

                # Answer
                claude_start = time.time()
                try:
                    answer = claude_text_llm.generate_answer(q, results)
                    print(answer)
                except Exception as e:
                    print(f"Claude error: {e}")
                claude_time = time.time() - claude_start

                total_time = time.time() - total_start
                print("\n[Timings]")
                print(f" - Retrieval time: {retrieval_time:.2f}s")
                print(f" - Claude answer call time (wrapper): {claude_time:.2f}s")
                print(f" - Total time (Question → Final Answer): {total_time:.2f}s")
                print("---------------------")

        # ----- EXIT -----
        elif choice == "3":
            print("Goodbye.")
            break

        else:
            print("Invalid choice. Please select 1, 2, or 3.")
315
+
316
+
317
if __name__ == "__main__":
    # Run the interactive CLI only when executed directly, not on import.
    main()
srd_engine_final.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # srd_engine_final.py
2
+ # ============================================================
3
+ # CedroPass SRD – Final RAG Engine (Stable, Section-Aware)
4
+ # ============================================================
5
+
6
+ import os
7
+ import re
8
+ import io
9
+ import base64
10
+ import time
11
+ import shutil
12
+ import warnings
13
+ from typing import List, Dict, Any, Optional
14
+
15
+ from dotenv import load_dotenv
16
+ load_dotenv()
17
+
18
+ # -------------------- Data Processing --------------------
19
+ import pdfplumber
20
+ import camelot
21
+ from pdf2image import convert_from_path
22
+ import pytesseract
23
+ from PIL import Image
24
+
25
+ # -------------------- NLP & Retrieval --------------------
26
+ import spacy
27
+ from sentence_transformers import CrossEncoder
28
+ from langchain_community.vectorstores import Chroma
29
+ from langchain_community.embeddings import HuggingFaceEmbeddings
30
+ from langchain_community.retrievers import BM25Retriever
31
+ from langchain_core.documents import Document
32
+ from rapidfuzz import process as fuzz_process
33
+
34
+ # -------------------- Claude --------------------
35
+ try:
36
+ from anthropic import Anthropic
37
+ except ImportError:
38
+ Anthropic = None
39
+
40
+ # -------------------- CONFIG --------------------
41
+ warnings.filterwarnings("ignore")
42
+ Image.MAX_IMAGE_PIXELS = None
43
+
44
+ POPPLER_PATH = os.getenv("POPPLER_PATH")
45
+ TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
46
+ if os.path.exists(TESSERACT_PATH):
47
+ pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
48
+
49
+ # -------------------- NLP MODEL --------------------
50
+ print("[SYSTEM] Loading NLP pipelines...")
51
+ try:
52
+ NLP_EN = spacy.load("en_core_web_sm")
53
+ except OSError:
54
+ from spacy.cli import download
55
+ download("en_core_web_sm")
56
+ NLP_EN = spacy.load("en_core_web_sm")
57
+
58
+
59
+ # ============================================================
60
+ # TEXT UTILS
61
+ # ============================================================
62
def normalize_text(text: str) -> str:
    """Collapse every whitespace run to a single space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
64
+
65
+
66
def lemmatize_text(text: str) -> str:
    """Return *text* (capped at 50k chars) as space-joined lowercase lemmas.

    Whitespace and punctuation tokens are dropped. Relies on the
    module-level spaCy pipeline ``NLP_EN``.
    """
    parsed = NLP_EN(text[:50000])
    lemmas = [
        tok.lemma_.lower()
        for tok in parsed
        if not (tok.is_space or tok.is_punct)
    ]
    return " ".join(lemmas)
73
+
74
+
75
+ # ============================================================
76
+ # SECTION-AWARE SRD SPLITTER (CRITICAL FIX)
77
+ # ============================================================
78
class SmartSRDSplitter:
    """
    Section-aware SRD splitter: every child paragraph inherits the most
    recent header's section title and type ("functional" /
    "nonfunctional" / "general") until a new header line appears.

    FIX: the non-functional check now runs BEFORE the functional check.
    Previously the functional branch was tested first, and
    "non-functional requirements" contains the substring
    "functional requirement" (likewise "nfr-" contains "fr-"), so NFR
    section headers were mislabelled as "functional".
    """

    # Matches numbered headings (1., 2.3.4), requirement IDs (FR-n / NFR-n)
    # and "Title Case:" style headers.
    HEADER_REGEX = re.compile(
        r"^(\d+(\.\d+)*|FR-\d+|NFR-\d+|[A-Z][A-Za-z\s]{3,}:)",
        re.IGNORECASE,
    )

    def split_text(self, text: str) -> List[Document]:
        """Split *text* into section-tagged Documents (blank lines dropped)."""
        docs: List[Document] = []
        buffer: List[str] = []
        current_section_title = "General"
        current_section_type = "general"

        def flush():
            # Emit the buffered chunk under the section active when it was built.
            if buffer:
                docs.append(
                    Document(
                        page_content="\n".join(buffer),
                        metadata={
                            "type": "text",
                            "section": current_section_title,
                            "section_type": current_section_type,
                            "source": "SRD_Main",
                        },
                    )
                )

        for raw in text.splitlines():
            line = raw.strip()
            if not line:
                continue

            if self.HEADER_REGEX.match(line):
                # New section: flush the previous chunk first.
                flush()
                buffer = [line]
                current_section_title = line[:80]

                lowered = line.lower()
                # Non-functional MUST be tested first (substring overlap).
                if "non-functional" in lowered or "nfr-" in lowered:
                    current_section_type = "nonfunctional"
                elif "functional requirement" in lowered or "fr-" in lowered:
                    current_section_type = "functional"
                else:
                    current_section_type = "general"
            else:
                buffer.append(line)

        # Final flush of the trailing chunk.
        flush()
        return docs
145
+
146
+
147
+ # ============================================================
148
+ # PDF EXTRACTORS
149
+ # ============================================================
150
def extract_pdf_text(path: str) -> str:
    """Extract the concatenated text of every page of the PDF at *path*.

    Pages with no extractable text are skipped; each page's text is
    newline-terminated. Uses a list + join instead of repeated string
    concatenation (which is quadratic on large documents).
    """
    pages: List[str] = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text + "\n")
    return "".join(pages)
158
+
159
+
160
def extract_tables(path: str) -> List[Document]:
    """Extract tables from the PDF at *path* as markdown Documents.

    Uses camelot's "stream" flavor on all pages; markdown shorter than
    ~30 chars is dropped as noise. Returns an empty list on any failure.
    """
    docs: List[Document] = []
    try:
        tables = camelot.read_pdf(path, pages="all", flavor="stream")
        for i, t in enumerate(tables):
            md = t.df.to_markdown(index=False)
            if len(md) > 30:
                docs.append(
                    Document(
                        page_content=md,
                        metadata={
                            "type": "table",
                            "section_type": "general",
                            "source": "SRD_Table",
                        },
                    )
                )
    except Exception:
        # Deliberate best-effort: camelot fails on many PDFs (scanned
        # pages, no tables); table extraction is optional enrichment.
        pass
    return docs
180
+
181
+
182
+ # ============================================================
183
+ # DIAGRAM INTERPRETER (TEXT-ONLY SAFE)
184
+ # ============================================================
185
class DiagramInterpreter:
    """Describe diagram images: Claude Vision when an API key is available,
    otherwise plain OCR via pytesseract."""

    def __init__(self):
        # Client stays None when the anthropic package is missing or no
        # API key is configured; describe() then degrades to OCR-only.
        self.client = (
            Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
            if Anthropic and os.getenv("ANTHROPIC_API_KEY")
            else None
        )

    def describe(self, image: Image.Image, label: str) -> str:
        """Return a textual description of *image*, presented as a *label* diagram."""
        if not self.client:
            # OCR fallback — no vision model available.
            return pytesseract.image_to_string(image)

        # Re-encode as JPEG and base64 for the Anthropic image content block.
        buf = io.BytesIO()
        image.convert("RGB").save(buf, format="JPEG", quality=85)
        b64 = base64.b64encode(buf.getvalue()).decode()

        resp = self.client.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=600,
            temperature=0.2,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Explain this {label} diagram for an SRD."},
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": b64,
                            },
                        },
                    ],
                }
            ],
        )
        return resp.content[0].text
223
+
224
+
225
+ # ============================================================
226
+ # CORE RAG ENGINE
227
+ # ============================================================
228
class SRDChatbotEngine:
    """Core RAG engine: hybrid (dense Chroma + sparse BM25) retrieval over
    SRD chunks, cross-encoder reranking, and intent-routed answering
    through a Claude wrapper."""

    def __init__(self, chroma_dir: str = "chroma_db_final"):
        print("[ENGINE] Initializing retrievers...")

        # Local HF models: MiniLM embeddings + MS-MARCO cross-encoder reranker.
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

        self.chroma_dir = chroma_dir
        # Populated by build_index(); answer() requires a built index.
        self.vectorstore: Optional[Chroma] = None
        self.chroma_retriever = None
        self.bm25_retriever: Optional[BM25Retriever] = None
        # Alphanumeric tokens seen during indexing.
        self.vocab = set()

    # -------------------- BUILD INDEX --------------------
    def build_index(
        self,
        pdf_path: str,
        diagrams: Optional[List[str]] = None,
    ):
        """(Re)build the Chroma + BM25 indexes from the SRD PDF at *pdf_path*.

        Any existing index directory is wiped first.
        NOTE(review): *diagrams* is currently unused in this body.
        """
        if os.path.exists(self.chroma_dir):
            shutil.rmtree(self.chroma_dir)

        splitter = SmartSRDSplitter()
        docs = splitter.split_text(extract_pdf_text(pdf_path))
        docs.extend(extract_tables(pdf_path))

        # Attach lemmatized text and grow the engine vocabulary.
        for d in docs:
            d.metadata["lemma"] = lemmatize_text(d.page_content)
            for w in d.page_content.split():
                if w.isalnum():
                    self.vocab.add(w.lower())

        self.vectorstore = Chroma.from_documents(
            docs,
            embedding=self.embedding_model,
            persist_directory=self.chroma_dir,
            collection_name="srd_final",
        )

        # Both retrievers fetch 20 candidates; reranking narrows later.
        self.chroma_retriever = self.vectorstore.as_retriever(search_kwargs={"k": 20})
        self.bm25_retriever = BM25Retriever.from_documents(docs)
        self.bm25_retriever.k = 20

        print(f"✅ Indexed {len(docs)} SRD chunks")

    # -------------------- INTENT --------------------
    def detect_intent(self, q: str) -> str:
        """Classify a query as 'enumeration' (list-everything style) or 'qa'."""
        q = q.lower()
        if any(w in q for w in ["list", "enumerate", "all functional", "requirements of"]):
            return "enumeration"
        return "qa"

    # -------------------- ENUMERATION (NO SIM SEARCH) --------------------
    def list_functional_requirements(self) -> List[str]:
        """Fetch every chunk tagged section_type == 'functional' straight
        from Chroma via a metadata filter (no similarity search)."""
        data = self.vectorstore.get(
            where={"section_type": "functional"}
        )
        return data.get("documents", [])

    # -------------------- QUERY --------------------
    def answer(self, query: str, claude) -> str:
        """Answer *query* with Claude, grounded in retrieved SRD context.

        Enumeration queries bypass similarity search; QA queries merge
        dense + sparse candidates, rerank with the cross-encoder, and
        prompt Claude with only the surviving context.
        """
        intent = self.detect_intent(query)

        if intent == "enumeration":
            items = self.list_functional_requirements()
            if not items:
                return "I could not find sufficient information in the provided SRD."

            prompt = f"""
You are a Senior Project Architect.

List ALL functional requirements below.
Do not merge, summarize, or invent anything.

REQUIREMENTS:
{chr(10).join(items)}
"""
            return claude.generate_raw(prompt)

        # ---------- Normal QA ----------
        dense = self.chroma_retriever.invoke(query)
        sparse = self.bm25_retriever.invoke(query)

        pool = dense + sparse
        pairs = [[query, d.page_content] for d in pool]
        scores = self.reranker.predict(pairs)

        # Keep at most 8 best-reranked chunks above the score floor (-6).
        top = [
            d.page_content
            for d, s in sorted(zip(pool, scores), key=lambda x: x[1], reverse=True)
            if s > -6
        ][:8]

        if not top:
            return "I could not find sufficient information in the provided SRD."

        # NOTE(review): top[:4000] slices the *list* (already <= 8 items),
        # not characters — likely intended as a character cap on the context.
        ctx = "\n---\n".join(top[:4000])

        prompt = f"""
Answer using ONLY the SRD context below.
If unsupported, say so explicitly.

CONTEXT:
{ctx}

QUESTION:
{query}
"""
        return claude.generate_raw(prompt)
339
+
340
+
341
+ # ============================================================
342
+ # CLAUDE ANSWERER
343
+ # ============================================================
344
class ClaudeAnswerer:
    """Thin wrapper around the Anthropic client for plain-text completions."""

    def __init__(self):
        # Fail fast when the optional anthropic package is not installed.
        if Anthropic is None:
            raise RuntimeError("anthropic not installed")

        self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
        self.model = "claude-sonnet-4-5-20250929"

    def generate_raw(self, prompt: str) -> str:
        """Send *prompt* as a single user message and return the text reply."""
        resp = self.client.messages.create(
            model=self.model,
            max_tokens=1200,
            temperature=0.2,
            messages=[{"role": "user", "content": prompt}],
        )
        return resp.content[0].text
srd_engine_v2.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # srd_engine_v2.py
2
+ import os
3
+ import re
4
+ import io
5
+ import base64
6
+ import hashlib
7
+ from datetime import datetime
8
+ from typing import List, Optional
9
+
10
+ from dotenv import load_dotenv
11
+ load_dotenv()
12
+
13
+ # -------------------- Data Processing --------------------
14
+ import pdfplumber
15
+ import camelot
16
+ from pdf2image import convert_from_path, pdfinfo_from_path
17
+ import pytesseract
18
+ from PIL import Image
19
+
20
+ # -------------------- Vector Store --------------------
21
+ from langchain_community.vectorstores import Chroma
22
+ from langchain_core.documents import Document
23
+
24
+ # -------------------- Claude --------------------
25
+ try:
26
+ from anthropic import Anthropic
27
+ except ImportError:
28
+ Anthropic = None
29
+
30
+ from srd_engine_final import SRDChatbotEngine, ClaudeAnswerer
31
+
32
+ POPPLER_PATH = os.getenv("POPPLER_PATH")
33
+
34
+
35
+ # =====================================================
36
+ # UTILS
37
+ # =====================================================
38
def content_hash(text: str) -> str:
    """Return the hex MD5 digest of *text*, used to de-duplicate chunks."""
    digest = hashlib.md5(text.encode("utf-8"))
    return digest.hexdigest()
40
+
41
+
42
def resize_for_claude(image: Image.Image, max_dim: int = 7900) -> Image.Image:
    """Downscale *image* so that neither side exceeds *max_dim* pixels.

    The aspect ratio is preserved. Images already within bounds are
    returned unchanged (the same object, no copy is made).
    """
    width, height = image.size
    if max(width, height) <= max_dim:
        return image
    factor = min(max_dim / width, max_dim / height)
    new_size = (int(width * factor), int(height * factor))
    return image.resize(new_size, Image.LANCZOS)
48
+
49
+
50
+ # =====================================================
51
+ # SECTION / HEADER DETECTION
52
+ # =====================================================
53
+ SECTION_PATTERNS = {
54
+ "functional": re.compile(r"(functional\s+requirements|FR-\d+)", re.I),
55
+ "nonfunctional": re.compile(r"(non[-\s]?functional\s+requirements|NFR-\d+)", re.I),
56
+ }
57
+
58
+
59
+ def detect_section_type(text: str) -> str:
60
+ for k, pat in SECTION_PATTERNS.items():
61
+ if pat.search(text):
62
+ return k
63
+ return "general"
64
+
65
+
66
+ # =====================================================
67
+ # SRD-AWARE SPLITTER (REQUIREMENT SAFE)
68
+ # =====================================================
69
class SmartSRDSplitter:
    """Split SRD text into header-anchored chunks without cutting a
    requirement in half.

    A new chunk starts whenever a line looks like a requirement ID
    (FR-n / NFR-n), a numbered heading (x.y), or a 'Title:'-style header.
    """

    HEADER_REGEX = re.compile(
        r"(FR-\d+|NFR-\d+|\d+\.\d+|[A-Z][A-Za-z\s]{3,}:)",
        re.I
    )

    def split_text(self, text: str) -> List[Document]:
        """Return one Document per detected section of *text*.

        Each Document carries the header line (truncated to 80 chars) that
        introduced its section and an inferred ``section_type``.
        """
        chunks: List[Document] = []
        pending: List[str] = []
        header = "General"

        def flush() -> None:
            # Emit the buffered lines (if any) as one Document, tagged with
            # the header that introduced them.
            if not pending:
                return
            body = "\n".join(pending)
            chunks.append(
                Document(
                    page_content=body,
                    metadata={
                        "type": "text",
                        "header": header,
                        "section_type": detect_section_type(body),
                    },
                )
            )

        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            if self.HEADER_REGEX.match(line):
                # A header closes the previous section and opens a new one
                # that begins with the header line itself.
                flush()
                pending = [line]
                header = line[:80]
            else:
                pending.append(line)

        flush()
        return chunks
114
+
115
+
116
+ # =====================================================
117
+ # DIAGRAM INTERPRETER
118
+ # =====================================================
119
class DiagramInterpreter:
    """Turn a diagram image into descriptive text for indexing.

    Uses Claude Vision when enabled and configured; when no vision output
    was produced (disabled, misconfigured, or an empty reply), falls back
    to Tesseract OCR of the raw image.
    """

    def __init__(self):
        # Anthropic client is created lazily on the first Claude-enabled call.
        self._anthropic = None

    def process_image(
        self,
        image: Image.Image,
        label: str,
        use_qwen: bool,
        use_claude: bool
    ) -> str:
        """Describe *image* (a diagram of kind *label*) as plain text.

        Args:
            image: source diagram (PIL image).
            label: human-readable diagram kind, interpolated into the prompt.
            use_qwen: accepted for call compatibility but unused — no Qwen
                path is implemented in this class.
            use_claude: when True, attempt a Claude Vision description.

        Returns:
            The vision model's description (or a diagnostic message when
            Claude was requested but unavailable); if nothing was produced,
            the Tesseract OCR text of the image.
        """
        sections: List[str] = []

        if use_claude:
            if Anthropic is None:
                # Diagnostic messages are appended as content so the caller
                # still gets an indexable explanation of what happened.
                sections.append("Claude Vision requested but anthropic package is not installed.")
            else:
                if not self._anthropic:
                    api_key = os.getenv("ANTHROPIC_API_KEY")
                    if not api_key:
                        sections.append("Claude Vision requested but ANTHROPIC_API_KEY is not set.")
                    else:
                        self._anthropic = Anthropic(api_key=api_key)

                if self._anthropic:
                    # Claude rejects oversized images: downscale first, then
                    # ship as base64-encoded JPEG.
                    safe_image = resize_for_claude(image)
                    buf = io.BytesIO()
                    safe_image.convert("RGB").save(buf, format="JPEG", quality=85)
                    b64 = base64.b64encode(buf.getvalue()).decode()

                    resp = self._anthropic.messages.create(
                        model=os.getenv("CLAUDE_VISION_MODEL", "claude-sonnet-4-5-20250929"),
                        max_tokens=600,
                        temperature=0.2,
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": f"Explain this {label} diagram for an SRD."},
                                    {
                                        "type": "image",
                                        "source": {"type": "base64", "media_type": "image/jpeg", "data": b64},
                                    },
                                ],
                            }
                        ],
                    )

                    # Fast path: first content block is text. If the response
                    # shape differs, concatenate every text block instead.
                    try:
                        text0 = resp.content[0].text  # type: ignore[attr-defined]
                    except Exception:
                        text0 = ""
                        for block in getattr(resp, "content", []):
                            t = getattr(block, "text", None)
                            if t:
                                text0 += t + "\n"
                    text0 = text0.strip()

                    if text0:
                        sections.append(text0)

        if not sections:
            # Nothing from the vision path: fall back to plain OCR.
            sections.append(pytesseract.image_to_string(image))

        return "\n\n".join([s for s in sections if s.strip()]).strip()
184
+
185
+
186
+ # =====================================================
187
+ # SMART KNOWLEDGE BASE (MULTI-USER + MULTI-CHAT SAFE)
188
+ # =====================================================
189
class SmartKnowledgeBase(SRDChatbotEngine):
    """Project/chat/user-scoped knowledge base on top of SRDChatbotEngine.

    Every document stored here is tagged with (project_id, chat_id, user_id)
    and every read filters on the same triple, so different users and chats
    never see each other's chunks even though they share a single Chroma
    collection.
    """

    def __init__(self, chroma_dir="chroma_global_db"):
        super().__init__(chroma_dir)
        # Scope identifiers; all three must be set before ingest or query
        # (enforced by _require_scope).
        self.current_project_id: Optional[str] = None
        self.current_chat_id: Optional[str] = None  # ✅ NEW
        self.current_user_id: Optional[str] = None  # ✅ NEW

        # One shared persistent collection; isolation is enforced via
        # metadata filters, not separate collections.
        # NOTE(review): assumes SRDChatbotEngine.__init__ provides
        # self.embedding_model — confirm in srd_engine_final.py.
        self.vectorstore = Chroma(
            persist_directory=chroma_dir,
            embedding_function=self.embedding_model,
            collection_name="srd_knowledge"
        )

        self.interpreter = DiagramInterpreter()

    # ------------------------------
    # SESSION SCOPING
    # ------------------------------
    def set_current_project(self, name: str):
        # Normalize to a stable slug so the same project name always maps
        # to the same metadata value.
        self.current_project_id = name.lower().replace(" ", "_")

    def set_current_chat(self, chat_id: str):
        self.current_chat_id = chat_id

    def set_current_user(self, user_id: str):
        self.current_user_id = user_id

    def _require_scope(self):
        """Raise RuntimeError unless project, chat, and user are all set."""
        if not self.current_project_id:
            raise RuntimeError("Project not set. Call set_current_project(...) first.")
        if not self.current_chat_id:
            raise RuntimeError("Chat not set. Call set_current_chat(...) first.")
        if not self.current_user_id:
            raise RuntimeError("User not set. Call set_current_user(...) first.")

    def _where_scope(self) -> dict:
        """Build a Chroma `where` filter restricting reads to this scope."""
        # Chroma where filter (strict isolation)
        return {
            "$and": [
                {"project_id": {"$eq": self.current_project_id}},
                {"chat_id": {"$eq": self.current_chat_id}},
                {"user_id": {"$eq": self.current_user_id}},
            ]
        }

    # ------------------------------
    # LEARNING / USER CORRECTION
    # ------------------------------
    def learn_from_interaction(self, query: str, correction_text: str):
        """Persist a user-provided correction as a high-priority chunk.

        The correction is stored in the vector store under the current
        scope; empty/whitespace-only corrections are ignored.
        """
        self._require_scope()

        if not correction_text or not correction_text.strip():
            return

        # Classify the correction itself; if inconclusive, fall back to
        # classifying the query that prompted it.
        inferred = detect_section_type(correction_text)
        if inferred == "general":
            inferred = self.detect_requirement_type(query)

        doc = Document(
            page_content=correction_text.strip(),
            metadata={
                "type": "user_correction",
                "section_type": inferred,
                "project_id": self.current_project_id,
                "chat_id": self.current_chat_id,
                "user_id": self.current_user_id,
                "source": "user_feedback",
                "timestamp": datetime.now().isoformat(),
                "original_query": query,
                "priority": "high",
            },
        )

        self.vectorstore.add_documents([doc])
        self.vectorstore.persist()

    # ------------------------------
    # INGESTION
    # ------------------------------
    def process_document_step(self, path, ftype, label, use_qwen, use_claude):
        """Ingest one file into the scoped vector store.

        Args:
            path: filesystem path to the file.
            ftype: "pdf_text" (text PDF, plus table extraction) or
                "diagram" (image, or PDF rendered page-by-page).
            label: diagram label forwarded to DiagramInterpreter.
            use_qwen, use_claude: forwarded to DiagramInterpreter.

        Returns:
            The list of de-duplicated Documents that were added.
        """
        self._require_scope()

        docs: List[Document] = []

        if ftype == "pdf_text":
            with pdfplumber.open(path) as pdf:
                text = "\n".join((p.extract_text() or "") for p in pdf.pages)

            splitter = SmartSRDSplitter()
            docs = splitter.split_text(text)

            # Tables are extracted separately; camelot failures (e.g. scanned
            # or malformed PDFs) are deliberately non-fatal best-effort.
            try:
                tables = camelot.read_pdf(path, pages="all", flavor="stream")
                for t in tables:
                    docs.append(
                        Document(
                            page_content=t.df.to_markdown(),
                            metadata={"type": "table", "section_type": "general"},
                        )
                    )
            except Exception:
                pass

        elif ftype == "diagram":
            if path.lower().endswith(".pdf"):
                # Render PDF pages one at a time to bound memory usage.
                info = pdfinfo_from_path(path, poppler_path=POPPLER_PATH)
                for page in range(1, info["Pages"] + 1):
                    imgs = convert_from_path(
                        path,
                        first_page=page,
                        last_page=page,
                        dpi=150,
                        poppler_path=POPPLER_PATH,
                    )
                    for img in imgs:
                        txt = self.interpreter.process_image(img, label, use_qwen, use_claude)
                        docs.append(Document(page_content=txt, metadata={"type": "diagram", "section_type": "general"}))
            else:
                img = Image.open(path)
                txt = self.interpreter.process_image(img, label, use_qwen, use_claude)
                docs.append(Document(page_content=txt, metadata={"type": "diagram", "section_type": "general"}))

        # ------------------------------
        # Dedup + metadata
        # ------------------------------
        # Drop exact-content duplicates (by MD5) and stamp every surviving
        # chunk with the current scope so reads can filter on it.
        seen = set()
        final_docs: List[Document] = []

        for d in docs:
            h = content_hash(d.page_content or "")
            if h in seen:
                continue
            seen.add(h)

            d.metadata["project_id"] = self.current_project_id
            d.metadata["chat_id"] = self.current_chat_id
            d.metadata["user_id"] = self.current_user_id
            d.metadata["timestamp"] = datetime.now().isoformat()

            final_docs.append(d)

        if final_docs:
            self.vectorstore.add_documents(final_docs)
            self.vectorstore.persist()

        return final_docs

    # ------------------------------
    # INTENT DETECTION
    # ------------------------------
    def detect_intent(self, query: str) -> str:
        """Keyword-classify *query* as 'enumeration', 'explanation', or 'lookup'."""
        q = (query or "").lower()
        if any(w in q for w in ["list", "show all", "enumerate", "give me all", "all of the"]):
            return "enumeration"
        if any(w in q for w in ["explain", "describe", "how", "why", "what is", "what are"]):
            return "explanation"
        return "lookup"

    # ------------------------------
    # REQUIREMENT TYPE DETECTION
    # ------------------------------
    def detect_requirement_type(self, query: str) -> str:
        """Keyword-classify *query* as 'functional' or 'nonfunctional'.

        Defaults to 'functional' when no keyword matches.
        """
        q = (query or "").lower()

        # Explicit NFR vocabulary first...
        if any(w in q for w in [
            "non functional", "non-functional", "nonfunctional", "nfr", "nfrs",
            "quality attributes", "quality requirements"
        ]):
            return "nonfunctional"

        # ...then quality-attribute terms that imply NFRs.
        if any(w in q for w in [
            "performance", "security", "availability", "reliability", "scalability",
            "usability", "maintainability", "portability", "compliance", "privacy",
            "latency", "throughput", "encryption", "audit", "logging", "backup",
        ]):
            return "nonfunctional"

        if any(w in q for w in ["functional", "fr-", "frs", "use case", "features"]):
            return "functional"

        return "functional"

    # ------------------------------
    # SMART RESPONSE (CHAT-ISOLATED)
    # ------------------------------
    def generate_smart_response(self, query: str, claude: ClaudeAnswerer) -> str:
        """Answer *query* from the scoped store via Claude.

        Enumeration-style queries dump ALL matching requirement chunks into
        the prompt; everything else uses scoped similarity search.
        """
        self._require_scope()

        intent = self.detect_intent(query)

        # =============== ENUMERATION MODE ===============
        if intent == "enumeration":
            req_type = self.detect_requirement_type(query)

            # Fetch EVERY chunk of the requested section type within scope
            # (no similarity ranking — enumeration must be complete).
            raw = self.vectorstore.get(
                where={
                    "$and": [
                        {"project_id": {"$eq": self.current_project_id}},
                        {"chat_id": {"$eq": self.current_chat_id}},
                        {"user_id": {"$eq": self.current_user_id}},
                        {"section_type": {"$eq": req_type}},
                    ]
                }
            )

            docs = raw.get("documents", []) or []

            # NFR sections are often not labelled; retry against 'general'
            # chunks before giving up.
            if not docs and req_type == "nonfunctional":
                raw2 = self.vectorstore.get(
                    where={
                        "$and": [
                            {"project_id": {"$eq": self.current_project_id}},
                            {"chat_id": {"$eq": self.current_chat_id}},
                            {"user_id": {"$eq": self.current_user_id}},
                            {"section_type": {"$eq": "general"}},
                        ]
                    }
                )
                docs2 = raw2.get("documents", []) or []
                if docs2:
                    docs = docs2

            if not docs:
                return "I could not find sufficient information in the provided SRD."

            title = "FUNCTIONAL REQUIREMENTS" if req_type == "functional" else "NON-FUNCTIONAL REQUIREMENTS"

            # chr(10) is '\n' — f-string expressions cannot contain backslashes
            # on older Python versions.
            prompt = f"""
You are a Senior Project Architect.

Return a COMPLETE numbered list of the {title} found below.
Do NOT invent items. Do NOT omit items. If duplicates exist, keep only one copy.

REQUIREMENTS:
{chr(10).join(docs)}
"""
            # Calls the Anthropic client directly (not claude.generate_raw)
            # to use a larger max_tokens budget for long lists.
            return claude.client.messages.create(
                model=claude.model,
                max_tokens=1400,
                temperature=0.2,
                messages=[{"role": "user", "content": prompt}],
            ).content[0].text

        # =============== NORMAL QA MODE ===============
        docs = self.vectorstore.similarity_search(
            query,
            k=12,
            filter=self._where_scope(),  # ✅ chat + user + project scoped
        )

        if not docs:
            return "I could not find sufficient information in the provided SRD."

        # Prefix each chunk with its SRD header so Claude can cite sections.
        ctx = ""
        for d in docs[:8]:
            ctx += f"[{d.metadata.get('header', 'SRD')}]\n{d.page_content}\n---\n"

        prompt = f"""
You are a Senior Project Architect.

Answer ONLY using the SRD context.
If unsupported, say so explicitly.

CONTEXT:
{ctx}

QUESTION:
{query}
"""
        return claude.client.messages.create(
            model=claude.model,
            max_tokens=1000,
            temperature=0.3,
            messages=[{"role": "user", "content": prompt}],
        ).content[0].text