Hasan-Atris3 commited on
Commit
6f8b70f
·
1 Parent(s): 793f5d8

my first commit

Browse files
Files changed (7) hide show
  1. .gitignore +72 -0
  2. app.py +263 -0
  3. chainlit.md +14 -0
  4. db.py +41 -0
  5. main_final.py +318 -0
  6. srd_engine_final.py +359 -0
  7. srd_engine_v2.py +463 -0
.gitignore ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ ENV/
26
+ env/
27
+ .venv
28
+
29
+ # Environment variables
30
+ .env
31
+ .env.local
32
+ .env.*.local
33
+
34
+ # IDE
35
+ .vscode/
36
+ .idea/
37
+ *.swp
38
+ *.swo
39
+ *~
40
+ .DS_Store
41
+
42
+ # Chainlit
43
+ .chainlit/
44
+
45
+ # ChromaDB
46
+ chroma_db*/
47
+ chroma_*/
48
+
49
+ # Database files
50
+ *.db
51
+ *.sqlite
52
+ *.sqlite3
53
+ cedropass.db
54
+
55
+ # Data files (uncomment if you don't want to track these)
56
+ # *.csv
57
+ # *.xlsx
58
+ # *.pdf
59
+ # *.jpg
60
+ # *.png
61
+
62
+ # Logs
63
+ *.log
64
+ logs/
65
+
66
+ # Temporary files
67
+ .files/
68
+ *.tmp
69
+ *.temp
70
+
71
+ # Results and outputs
72
+ results.txt
app.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import time
3
+ import chainlit as cl
4
+
5
+ from srd_engine_v2 import SmartKnowledgeBase, ClaudeAnswerer
6
+ from db import SessionLocal, User, Chat, Message
7
+
8
+ claude = ClaudeAnswerer()
9
+
10
+
11
@cl.on_chat_start
async def start():
    """Session bootstrap for a new Chainlit connection.

    Identifies (or creates) the anonymous user, lets them pick an existing
    chat or create a new one, wires a SmartKnowledgeBase engine into the
    session, and runs document ingestion for new chats or replays history
    for resumed ones. One DB session spans the handler and is always closed.
    """
    session = cl.user_session
    db = SessionLocal()
    try:
        # -----------------------
        # USER IDENTIFICATION
        # -----------------------
        # No auth: create an anonymous User row once per browser session and
        # cache its id in the Chainlit session.
        if not session.get("user_id"):
            user = User()
            db.add(user)
            db.commit()
            session.set("user_id", user.id)

        user_id = session.get("user_id")

        # -----------------------
        # CHAT SELECTION
        # -----------------------
        chats = db.query(Chat).filter(Chat.user_id == user_id).all()

        # Offer "new chat" plus at most the 5 most recent existing chats.
        actions = [cl.Action(name="new_chat", payload={}, label="➕ New Project Chat")]
        for c in chats[-5:]:
            actions.append(
                cl.Action(
                    name="resume_chat",
                    payload={"chat_id": c.id},
                    label=f"📂 {c.project_name}"
                )
            )

        res = await cl.AskActionMessage(
            content="Choose a chat or start a new one:",
            actions=actions
        ).send()

        # User timed out / dismissed the prompt: leave the session unconfigured.
        if not res:
            return

        # -----------------------
        # NEW CHAT
        # -----------------------
        if res["name"] == "new_chat":
            project_res = await cl.AskUserMessage(
                content="Enter **Project Name**:",
                timeout=300
            ).send()
            if not project_res:
                return
            project_name = project_res["output"]

            learn_res = await cl.AskActionMessage(
                content="Allow this chat to be saved/learned for improving the bot?",
                actions=[
                    cl.Action(name="learn_yes", payload={"v": True}, label="✅ Yes (Enable Learning)"),
                    cl.Action(name="learn_no", payload={"v": False}, label="❌ No (Do Not Learn)"),
                ],
            ).send()
            # Default to learning-enabled when the prompt is dismissed.
            learning_enabled = bool(learn_res["payload"]["v"]) if learn_res else True

            chat = Chat(user_id=user_id, project_name=project_name, learning_enabled=learning_enabled)
            db.add(chat)
            db.commit()

            session.set("chat_id", chat.id)
            session.set("learning_enabled", chat.learning_enabled)

            # One shared Chroma directory; the engine scopes queries by
            # project / chat / user set below.
            engine = SmartKnowledgeBase(chroma_dir="chroma_global_db")
            engine.set_current_project(project_name)
            engine.set_current_chat(chat.id)  # ✅ NEW
            engine.set_current_user(user_id)  # ✅ NEW
            session.set("engine", engine)

            await run_ingestion(engine)

        # -----------------------
        # RESUME CHAT
        # -----------------------
        else:
            chat_id = res["payload"]["chat_id"]
            # NOTE(review): Query.get() is the legacy SQLAlchemy API;
            # 2.0 style would be db.get(Chat, chat_id) — confirm version.
            chat = db.query(Chat).get(chat_id)
            if not chat:
                await cl.Message(content="⚠️ Chat not found.").send()
                return

            session.set("chat_id", chat.id)
            session.set("learning_enabled", chat.learning_enabled)

            engine = SmartKnowledgeBase(chroma_dir="chroma_global_db")
            engine.set_current_project(chat.project_name)
            engine.set_current_chat(chat.id)  # ✅ NEW
            engine.set_current_user(user_id)  # ✅ NEW
            session.set("engine", engine)

            # Restore history: replay stored messages in chronological order.
            messages = (
                db.query(Message)
                .filter(Message.chat_id == chat_id)
                .order_by(Message.created_at)
                .all()
            )
            for m in messages:
                await cl.Message(content=m.content, author=m.role).send()

    finally:
        db.close()
117
+
118
+
119
async def run_ingestion(engine: SmartKnowledgeBase):
    """Interactive ingestion flow for a new chat.

    Asks for the SRD PDF, a diagram-vision mode (Qwen / Claude / both /
    none), then loops letting the user upload titled diagrams. Progress is
    appended to a single status message updated in place. Returns silently
    if the user cancels any prompt.
    """
    files = await cl.AskFileMessage(
        content="Upload the **SRD PDF**:",
        accept=["application/pdf"],
        max_size_mb=50,
        timeout=600
    ).send()
    if not files:
        # Cancelled / timed out — nothing to ingest.
        return

    srd_file_path = files[0].path

    res = await cl.AskActionMessage(
        content="Select Diagram Vision Mode:",
        actions=[
            cl.Action(name="qwen", payload={"v": "qwen"}, label="Qwen"),
            cl.Action(name="claude", payload={"v": "claude"}, label="Claude"),
            cl.Action(name="both", payload={"v": "both"}, label="Both"),
            cl.Action(name="none", payload={"v": "none"}, label="None"),
        ]
    ).send()

    # Dismissed prompt falls back to OCR-only ("none").
    mode = res["payload"]["v"] if res else "none"
    use_qwen = mode in ("qwen", "both")
    use_claude = mode in ("claude", "both")

    status = cl.Message(content="🚀 Starting ingestion...")
    await status.send()

    # Blocking CPU/IO-heavy work — run off the event loop thread.
    await cl.make_async(engine.process_document_step)(
        srd_file_path, "pdf_text", "SRD Main", False, False
    )

    status.content += "\n✅ SRD indexed"
    await status.update()

    # Diagram upload loop: one titled diagram per iteration until "Done"
    # or any cancelled prompt.
    while True:
        add = await cl.AskActionMessage(
            content="Add a diagram?",
            actions=[
                cl.Action(name="yes", payload={}, label="➕ Add"),
                cl.Action(name="done", payload={}, label="Done"),
            ]
        ).send()

        if not add or add["name"] == "done":
            break

        title = await cl.AskUserMessage(content="Diagram title:", timeout=300).send()
        if not title:
            break

        file = await cl.AskFileMessage(
            content="Upload diagram:",
            accept=["image/png", "image/jpeg", "application/pdf"],
            max_size_mb=20,
            timeout=600
        ).send()

        if not file:
            break

        await cl.make_async(engine.process_document_step)(
            file[0].path, "diagram", title["output"], use_qwen, use_claude
        )

        status.content += f"\n🎨 Diagram '{title['output']}' indexed"
        await status.update()

    status.content += "\n🎉 Ingestion complete. Ask questions!"
    await status.update()
190
+
191
+
192
@cl.on_message
async def main(message: cl.Message):
    """Handle one user question.

    Persists the question, asks the engine (via Claude) for an answer,
    persists and displays the answer, then offers a "Correct This" action
    carrying the original question in its payload.
    """
    session = cl.user_session
    db = SessionLocal()
    try:
        engine: SmartKnowledgeBase = session.get("engine")
        chat_id = session.get("chat_id")

        # Both are set in on_chat_start; absence means a stale/broken session.
        if not engine or not chat_id:
            await cl.Message(content="⚠️ Session error. Please refresh.").send()
            return

        # Save user msg
        db.add(Message(chat_id=chat_id, role="user", content=message.content))
        db.commit()

        # Answer — blocking RAG + LLM call, run off the event loop thread.
        response = await cl.make_async(engine.generate_smart_response)(message.content, claude)

        # Save assistant msg
        db.add(Message(chat_id=chat_id, role="assistant", content=response))
        db.commit()

        await cl.Message(content=response).send()

        # Feedback: the original question rides in the action payload so the
        # correction callback can pair question + correction.
        await cl.Message(
            content="",
            actions=[
                cl.Action(
                    name="correct",
                    payload={"original": message.content},
                    label="🔧 Correct This"
                )
            ]
        ).send()

    finally:
        db.close()
231
+
232
+
233
@cl.action_callback("correct")
async def on_correct(action):
    """Collect a user correction for a previous answer.

    The correction text is always persisted as a "user_feedback" message
    (audit trail); it is fed back into the engine only when learning is
    enabled for this chat.
    """
    session = cl.user_session
    db = SessionLocal()
    try:
        engine: SmartKnowledgeBase = session.get("engine")
        chat_id = session.get("chat_id")
        learning_enabled = bool(session.get("learning_enabled", True))

        # Remove the button so the same answer can't be corrected twice.
        await action.remove()

        res = await cl.AskUserMessage(
            content="Paste the correct information:",
            timeout=600
        ).send()
        if not res:
            return

        # Always store correction text in DB (audit trail)
        db.add(Message(chat_id=chat_id, role="user_feedback", content=res["output"]))
        db.commit()

        # Learn only if allowed
        if learning_enabled:
            engine.learn_from_interaction(action.payload["original"], res["output"])
            await cl.Message(content="✅ Correction saved and learned.").send()
        else:
            await cl.Message(content="✅ Correction saved (learning disabled for this chat).").send()

    finally:
        db.close()
chainlit.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Welcome to Chainlit! 🚀🤖
2
+
3
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
4
+
5
+ ## Useful Links 🔗
6
+
7
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
8
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
9
+
10
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
11
+
12
+ ## Welcome screen
13
+
14
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
db.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # db.py
2
+ from sqlalchemy import (
3
+ create_engine, Column, String, Text, DateTime, ForeignKey, Boolean
4
+ )
5
+ from sqlalchemy.orm import declarative_base, sessionmaker
6
+ from datetime import datetime
7
+ import uuid
8
+
9
+ Base = declarative_base()
10
+ engine = create_engine("sqlite:///cedropass.db")
11
+ SessionLocal = sessionmaker(bind=engine)
12
+
13
def gen_id() -> str:
    """Produce a fresh random UUID4 string, used as a primary-key default."""
    return f"{uuid.uuid4()}"
15
+
16
+
17
class User(Base):
    """Anonymous end user; app.py creates one row per new browser session."""
    __tablename__ = "users"
    id = Column(String, primary_key=True, default=gen_id)  # UUID4 string
    # NOTE(review): datetime.utcnow is naive and deprecated since 3.12;
    # consider a timezone-aware default when migrating.
    created_at = Column(DateTime, default=datetime.utcnow)
21
+
22
+
23
class Chat(Base):
    """A project-scoped conversation owned by a User."""
    __tablename__ = "chats"
    id = Column(String, primary_key=True, default=gen_id)  # UUID4 string
    user_id = Column(String, ForeignKey("users.id"))
    project_name = Column(String)
    learning_enabled = Column(Boolean, default=True)  # ✅ opt-in learning
    created_at = Column(DateTime, default=datetime.utcnow)
30
+
31
+
32
class Message(Base):
    """One turn in a chat; `role` identifies who produced the content."""
    __tablename__ = "messages"
    id = Column(String, primary_key=True, default=gen_id)  # UUID4 string
    chat_id = Column(String, ForeignKey("chats.id"))
    role = Column(String)  # user / assistant / user_feedback
    content = Column(Text)
    created_at = Column(DateTime, default=datetime.utcnow)
39
+
40
+
41
+ Base.metadata.create_all(engine)
main_final.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # # main2.py
2
+
3
+ # import os
4
+ # import time
5
+ # from dotenv import load_dotenv
6
+
7
+ # load_dotenv()
8
+
9
+ # from srd_engine_final import SRDChatbotEngine, ClaudeAnswerer
10
+
11
+
12
+ # def yes_no(prompt: str, default: bool = False) -> bool:
13
+ # """
14
+ # Simple [y/N] helper.
15
+ # """
16
+ # raw = input(prompt).strip().lower()
17
+ # if not raw:
18
+ # return default
19
+ # return raw in ("y", "yes")
20
+
21
+
22
+ # def main():
23
+ # engine = SRDChatbotEngine()
24
+ # claude_text_llm = None # Lazy init (for final answers)
25
+
26
+ # print("\n" + "=" * 50)
27
+ # print(" ULTIMATE SRD CO-PILOT (Claude + Qwen2-VL)")
28
+ # print("=" * 50)
29
+
30
+ # while True:
31
+ # print("\n1. Index Documents")
32
+ # print("2. Ask Question")
33
+ # print("3. Exit")
34
+
35
+ # choice = input("\nChoose: ").strip()
36
+
37
+ # # ----- INDEX -----
38
+ # if choice == "1":
39
+ # pdf = input("Enter SRD PDF path: ").strip().strip('"')
40
+ # if not os.path.exists(pdf):
41
+ # print("[ERROR] SRD PDF not found.")
42
+ # continue
43
+
44
+ # gantt = (
45
+ # input("Gantt Chart path (optional): ").strip().strip('"') or None
46
+ # )
47
+ # cls = (
48
+ # input("Class Diagram path (optional): ").strip().strip('"') or None
49
+ # )
50
+ # seq = (
51
+ # input("Sequence Diagram path (optional): ").strip().strip('"')
52
+ # or None
53
+ # )
54
+
55
+ # # Vision choices
56
+ # print("\nDiagram understanding options:")
57
+
58
+ # # <<< ADDED: Ask user if they want Qwen2-VL >>>
59
+ # use_qwen_vision = yes_no(
60
+ # "Use Qwen2-VL (free, open-source vision)? (y/N): ", default=False
61
+ # )
62
+
63
+ # # EXISTING CLAUDE OPTION
64
+ # use_claude_vision = yes_no(
65
+ # "Also use Claude Vision for diagrams? (y/N): ", default=False
66
+ # )
67
+
68
+ # # User feedback
69
+ # if use_qwen_vision and use_claude_vision:
70
+ # print("→ Diagrams will be processed by BOTH Qwen2-VL (free) and Claude Vision (paid).")
71
+ # elif use_qwen_vision:
72
+ # print("→ Diagrams will be processed ONLY by Qwen2-VL (free).")
73
+ # elif use_claude_vision:
74
+ # print("→ Diagrams will be processed ONLY by Claude Vision.")
75
+ # else:
76
+ # print("→ No Vision AI selected. Using OCR only (fastest).")
77
+
78
+ # try:
79
+ # engine.build_index(
80
+ # pdf_path=pdf,
81
+ # gantt_path=gantt,
82
+ # class_path=cls,
83
+ # seq_path=seq,
84
+ # use_qwen_vision=use_qwen_vision, # <<< CHANGED FROM True
85
+ # use_claude_vision=use_claude_vision,
86
+ # )
87
+ # print("✔ Indexed successfully.")
88
+ # except Exception as e:
89
+ # print(f"[ERROR] Indexing failed: {e}")
90
+
91
+ # # ----- CHAT -----
92
+ # elif choice == "2":
93
+ # if not engine.vectorstore:
94
+ # print("Please index documents first (Option 1).")
95
+ # continue
96
+
97
+ # if claude_text_llm is None:
98
+ # try:
99
+ # claude_text_llm = ClaudeAnswerer()
100
+ # print("✔ Claude (text) initialized for final answers.")
101
+ # except Exception as e:
102
+ # print(f"[ERROR] Failed to init Claude for answers: {e}")
103
+ # print(
104
+ # "Make sure 'anthropic' is installed and ANTHROPIC_API_KEY is set."
105
+ # )
106
+ # continue
107
+
108
+ # while True:
109
+ # q = input("\n[You]: ").strip()
110
+ # if q.lower() in ("exit", "back", "quit"):
111
+ # break
112
+
113
+ # # <<< ADDED: Total question timer >>>
114
+ # total_start = time.time()
115
+
116
+ # # ----- Retrieval Timer -----
117
+ # retrieval_start = time.time()
118
+ # try:
119
+ # results = engine.hybrid_search(q, top_k=7)
120
+ # except Exception as e:
121
+ # print(f"[ERROR] Search failed: {e}")
122
+ # continue
123
+ # retrieval_time = time.time() - retrieval_start
124
+ # print(f"[Retrieved in {retrieval_time:.2f}s]")
125
+
126
+ # if not results:
127
+ # print("No matching content found in the SRD or diagrams.")
128
+ # continue
129
+
130
+ # # Debug: show where the info came from
131
+ # for r in results:
132
+ # src = r["metadata"].get("source")
133
+ # sect = r["metadata"].get("section", "N/A")
134
+ # score = r["score"]
135
+ # print(f" → {src} | section={sect} | score={score:.2f}")
136
+
137
+ # print("\n--- Claude Answer ---")
138
+
139
+ # # ----- Claude answering time -----
140
+ # claude_start = time.time()
141
+ # try:
142
+ # answer = claude_text_llm.generate_answer(q, results)
143
+ # print(answer)
144
+ # except Exception as e:
145
+ # print(f"Claude error: {e}")
146
+ # claude_time = time.time() - claude_start
147
+
148
+ # print(f"\n[Claude Answer Time: {claude_time:.2f}s]")
149
+
150
+ # total_time = time.time() - total_start
151
+ # print(f"[Total Time (Question → Final Answer): {total_time:.2f}s]")
152
+ # print("---------------------")
153
+
154
+ # # ----- EXIT -----
155
+ # elif choice == "3":
156
+ # print("Goodbye.")
157
+ # break
158
+
159
+ # else:
160
+ # print("Invalid choice. Please select 1, 2, or 3.")
161
+
162
+
163
+ # if __name__ == "__main__":
164
+ # main()
165
+ # main2.py
166
+
167
+ import os
168
+ import time
169
+ from dotenv import load_dotenv
170
+
171
+ load_dotenv()
172
+
173
+ from srd_engine_final import SRDChatbotEngine, ClaudeAnswerer
174
+
175
+
176
def yes_no(prompt: str, default: bool = False) -> bool:
    """Ask *prompt* on stdin and interpret the reply as yes/no.

    An empty reply returns *default*; otherwise only 'y' or 'yes'
    (case-insensitive, surrounding whitespace ignored) counts as True.
    """
    answer = input(prompt).strip().lower()
    return default if not answer else answer in ("y", "yes")
184
+
185
+
186
def main():
    """Interactive CLI: index SRD documents, then chat over them via Claude.

    Menu loop: (1) build the index from a PDF plus optional diagrams,
    (2) enter a Q&A sub-loop with retrieval + Claude answering and timing
    printouts, (3) exit.
    """
    engine = SRDChatbotEngine()
    claude_text_llm = None  # Lazy init (for final answers)

    print("\n" + "=" * 50)
    print(" ULTIMATE SRD CO-PILOT (Claude + Qwen2-VL)")
    print("=" * 50)

    while True:
        print("\n1. Index Documents")
        print("2. Ask Question")
        print("3. Exit")

        choice = input("\nChoose: ").strip()

        # ----- INDEX -----
        if choice == "1":
            pdf = input("Enter SRD PDF path: ").strip().strip('"')
            if not os.path.exists(pdf):
                print("[ERROR] SRD PDF not found.")
                continue

            # Optional diagram inputs; empty answers become None.
            gantt = input("Gantt Chart path (optional): ").strip().strip('"') or None
            cls = input("Class Diagram path (optional): ").strip().strip('"') or None
            seq = input("Sequence Diagram path (optional): ").strip().strip('"') or None

            print("\nDiagram understanding options:")

            use_qwen_vision = yes_no(
                "Use Qwen2-VL (free, open-source vision)? (y/N): ", default=False
            )

            use_claude_vision = yes_no(
                "Also use Claude Vision for diagrams? (y/N): ", default=False
            )

            if use_qwen_vision and use_claude_vision:
                print("→ Diagrams will be processed by BOTH Qwen2-VL (free) and Claude Vision (paid).")
            elif use_qwen_vision:
                print("→ Diagrams will be processed ONLY by Qwen2-VL (free).")
            elif use_claude_vision:
                print("→ Diagrams will be processed ONLY by Claude Vision.")
            else:
                print("→ No Vision AI selected. Using OCR only (fastest).")

            try:
                t0 = time.time()
                # NOTE(review): SRDChatbotEngine.build_index in
                # srd_engine_final.py takes (pdf_path, diagrams) — these
                # keyword args would raise TypeError; confirm which engine
                # version this CLI is meant to drive.
                engine.build_index(
                    pdf_path=pdf,
                    gantt_path=gantt,
                    class_path=cls,
                    seq_path=seq,
                    use_qwen_vision=use_qwen_vision,
                    use_claude_vision=use_claude_vision,
                )
                t1 = time.time()
                print(f"✔ Indexed successfully in {t1 - t0:.2f}s.")
            except Exception as e:
                print(f"[ERROR] Indexing failed: {e}")

        # ----- CHAT -----
        elif choice == "2":
            if not engine.vectorstore:
                print("Please index documents first (Option 1).")
                continue

            # Lazy Claude init so indexing can run without an API key.
            if claude_text_llm is None:
                try:
                    claude_text_llm = ClaudeAnswerer()
                    print("✔ Claude (text) initialized for final answers.")
                except Exception as e:
                    print(f"[ERROR] Failed to init Claude for answers: {e}")
                    print(
                        "Make sure 'anthropic' is installed and ANTHROPIC_API_KEY is set."
                    )
                    continue

            while True:
                q = input("\n[You]: ").strip()
                if q.lower() in ("exit", "back", "quit"):
                    break

                total_start = time.time()

                # Retrieval
                # NOTE(review): hybrid_search is not defined on the visible
                # SRDChatbotEngine — verify against the intended engine.
                retrieval_start = time.time()
                try:
                    results = engine.hybrid_search(q, top_k=7)
                except Exception as e:
                    print(f"[ERROR] Search failed: {e}")
                    continue
                retrieval_time = time.time() - retrieval_start
                print(f"[Retrieved in {retrieval_time:.2f}s]")

                if not results:
                    print("No matching content found in the SRD or diagrams.")
                    continue

                # Show provenance of each retrieved chunk.
                for r in results:
                    src = r["metadata"].get("source")
                    sect = r["metadata"].get("section", "N/A")
                    score = r["score"]
                    print(f" → {src} | section={sect} | score={score:.2f}")

                print("\n--- Claude Answer ---")

                # Answer
                claude_start = time.time()
                try:
                    answer = claude_text_llm.generate_answer(q, results)
                    print(answer)
                except Exception as e:
                    print(f"Claude error: {e}")
                claude_time = time.time() - claude_start

                total_time = time.time() - total_start
                print("\n[Timings]")
                print(f" - Retrieval time: {retrieval_time:.2f}s")
                print(f" - Claude answer call time (wrapper): {claude_time:.2f}s")
                print(f" - Total time (Question → Final Answer): {total_time:.2f}s")
                print("---------------------")

        # ----- EXIT -----
        elif choice == "3":
            print("Goodbye.")
            break

        else:
            print("Invalid choice. Please select 1, 2, or 3.")
315
+
316
+
317
if __name__ == "__main__":
    # Run the interactive CLI only when executed directly, not on import.
    main()
srd_engine_final.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # srd_engine_final.py
2
+ # ============================================================
3
+ # CedroPass SRD – Final RAG Engine (Stable, Section-Aware)
4
+ # ============================================================
5
+
6
+ import os
7
+ import re
8
+ import io
9
+ import base64
10
+ import time
11
+ import shutil
12
+ import warnings
13
+ from typing import List, Dict, Any, Optional
14
+
15
+ from dotenv import load_dotenv
16
+ load_dotenv()
17
+
18
+ # -------------------- Data Processing --------------------
19
+ import pdfplumber
20
+ import camelot
21
+ from pdf2image import convert_from_path
22
+ import pytesseract
23
+ from PIL import Image
24
+
25
+ # -------------------- NLP & Retrieval --------------------
26
+ import spacy
27
+ from sentence_transformers import CrossEncoder
28
+ from langchain_community.vectorstores import Chroma
29
+ from langchain_community.embeddings import HuggingFaceEmbeddings
30
+ from langchain_community.retrievers import BM25Retriever
31
+ from langchain_core.documents import Document
32
+ from rapidfuzz import process as fuzz_process
33
+
34
+ # -------------------- Claude --------------------
35
+ try:
36
+ from anthropic import Anthropic
37
+ except ImportError:
38
+ Anthropic = None
39
+
40
+ # -------------------- CONFIG --------------------
41
+ warnings.filterwarnings("ignore")
42
+ Image.MAX_IMAGE_PIXELS = None
43
+
44
+ POPPLER_PATH = os.getenv("POPPLER_PATH")
45
+ TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
46
+ if os.path.exists(TESSERACT_PATH):
47
+ pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
48
+
49
+ # -------------------- NLP MODEL --------------------
50
+ print("[SYSTEM] Loading NLP pipelines...")
51
+ try:
52
+ NLP_EN = spacy.load("en_core_web_sm")
53
+ except OSError:
54
+ from spacy.cli import download
55
+ download("en_core_web_sm")
56
+ NLP_EN = spacy.load("en_core_web_sm")
57
+
58
+
59
+ # ============================================================
60
+ # TEXT UTILS
61
+ # ============================================================
62
def normalize_text(text: str) -> str:
    """Collapse every whitespace run to a single space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
64
+
65
+
66
def lemmatize_text(text: str) -> str:
    """Return *text* (capped at 50k chars) as space-joined lowercase lemmas.

    Whitespace and punctuation tokens are dropped. Relies on the
    module-level spaCy pipeline ``NLP_EN``.
    """
    parsed = NLP_EN(text[:50000])
    lemmas = [
        tok.lemma_.lower()
        for tok in parsed
        if not (tok.is_space or tok.is_punct)
    ]
    return " ".join(lemmas)
73
+
74
+
75
+ # ============================================================
76
+ # SECTION-AWARE SRD SPLITTER (CRITICAL FIX)
77
+ # ============================================================
78
class SmartSRDSplitter:
    """
    Section-aware SRD splitter: every child paragraph inherits the most
    recent header's section title and type ("functional" /
    "nonfunctional" / "general") until a new header line appears.

    FIX: the non-functional check now runs BEFORE the functional check.
    Previously the functional branch was tested first, and
    "non-functional requirements" contains the substring
    "functional requirement" (likewise "nfr-" contains "fr-"), so NFR
    section headers were mislabelled as "functional".
    """

    # Matches numbered headings (1., 2.3.4), requirement IDs (FR-n / NFR-n)
    # and "Title Case:" style headers.
    HEADER_REGEX = re.compile(
        r"^(\d+(\.\d+)*|FR-\d+|NFR-\d+|[A-Z][A-Za-z\s]{3,}:)",
        re.IGNORECASE,
    )

    def split_text(self, text: str) -> List[Document]:
        """Split *text* into section-tagged Documents (blank lines dropped)."""
        docs: List[Document] = []
        buffer: List[str] = []
        current_section_title = "General"
        current_section_type = "general"

        def flush():
            # Emit the buffered chunk under the section active when it was built.
            if buffer:
                docs.append(
                    Document(
                        page_content="\n".join(buffer),
                        metadata={
                            "type": "text",
                            "section": current_section_title,
                            "section_type": current_section_type,
                            "source": "SRD_Main",
                        },
                    )
                )

        for raw in text.splitlines():
            line = raw.strip()
            if not line:
                continue

            if self.HEADER_REGEX.match(line):
                # New section: flush the previous chunk first.
                flush()
                buffer = [line]
                current_section_title = line[:80]

                lowered = line.lower()
                # Non-functional MUST be tested first (substring overlap).
                if "non-functional" in lowered or "nfr-" in lowered:
                    current_section_type = "nonfunctional"
                elif "functional requirement" in lowered or "fr-" in lowered:
                    current_section_type = "functional"
                else:
                    current_section_type = "general"
            else:
                buffer.append(line)

        # Final flush of the trailing chunk.
        flush()
        return docs
145
+
146
+
147
+ # ============================================================
148
+ # PDF EXTRACTORS
149
+ # ============================================================
150
def extract_pdf_text(path: str) -> str:
    """Extract the concatenated text of every page of the PDF at *path*.

    Pages with no extractable text are skipped; each page's text is
    newline-terminated. Uses a list + join instead of repeated string
    concatenation (which is quadratic on large documents).
    """
    pages: List[str] = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text + "\n")
    return "".join(pages)
158
+
159
+
160
def extract_tables(path: str) -> List[Document]:
    """Extract tables from the PDF at *path* as markdown Documents.

    Uses camelot's "stream" flavor on all pages; markdown shorter than
    ~30 chars is dropped as noise. Returns an empty list on any failure.
    """
    docs: List[Document] = []
    try:
        tables = camelot.read_pdf(path, pages="all", flavor="stream")
        for i, t in enumerate(tables):
            md = t.df.to_markdown(index=False)
            if len(md) > 30:
                docs.append(
                    Document(
                        page_content=md,
                        metadata={
                            "type": "table",
                            "section_type": "general",
                            "source": "SRD_Table",
                        },
                    )
                )
    except Exception:
        # Deliberate best-effort: camelot fails on many PDFs (scanned
        # pages, no tables); table extraction is optional enrichment.
        pass
    return docs
180
+
181
+
182
+ # ============================================================
183
+ # DIAGRAM INTERPRETER (TEXT-ONLY SAFE)
184
+ # ============================================================
185
class DiagramInterpreter:
    """Describe diagram images: Claude Vision when an API key is available,
    otherwise plain OCR via pytesseract."""

    def __init__(self):
        # Client stays None when the anthropic package is missing or no
        # API key is configured; describe() then degrades to OCR-only.
        self.client = (
            Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
            if Anthropic and os.getenv("ANTHROPIC_API_KEY")
            else None
        )

    def describe(self, image: Image.Image, label: str) -> str:
        """Return a textual description of *image*, presented as a *label* diagram."""
        if not self.client:
            # OCR fallback — no vision model available.
            return pytesseract.image_to_string(image)

        # Re-encode as JPEG and base64 for the Anthropic image content block.
        buf = io.BytesIO()
        image.convert("RGB").save(buf, format="JPEG", quality=85)
        b64 = base64.b64encode(buf.getvalue()).decode()

        resp = self.client.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=600,
            temperature=0.2,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Explain this {label} diagram for an SRD."},
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": b64,
                            },
                        },
                    ],
                }
            ],
        )
        return resp.content[0].text
223
+
224
+
225
+ # ============================================================
226
+ # CORE RAG ENGINE
227
+ # ============================================================
228
class SRDChatbotEngine:
    """Core RAG engine: hybrid (dense Chroma + sparse BM25) retrieval over
    SRD chunks, cross-encoder reranking, and intent-routed answering
    through a Claude wrapper."""

    def __init__(self, chroma_dir: str = "chroma_db_final"):
        print("[ENGINE] Initializing retrievers...")

        # Local HF models: MiniLM embeddings + MS-MARCO cross-encoder reranker.
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

        self.chroma_dir = chroma_dir
        # Populated by build_index(); answer() requires a built index.
        self.vectorstore: Optional[Chroma] = None
        self.chroma_retriever = None
        self.bm25_retriever: Optional[BM25Retriever] = None
        # Alphanumeric tokens seen during indexing.
        self.vocab = set()

    # -------------------- BUILD INDEX --------------------
    def build_index(
        self,
        pdf_path: str,
        diagrams: Optional[List[str]] = None,
    ):
        """(Re)build the Chroma + BM25 indexes from the SRD PDF at *pdf_path*.

        Any existing index directory is wiped first.
        NOTE(review): *diagrams* is currently unused in this body.
        """
        if os.path.exists(self.chroma_dir):
            shutil.rmtree(self.chroma_dir)

        splitter = SmartSRDSplitter()
        docs = splitter.split_text(extract_pdf_text(pdf_path))
        docs.extend(extract_tables(pdf_path))

        # Attach lemmatized text and grow the engine vocabulary.
        for d in docs:
            d.metadata["lemma"] = lemmatize_text(d.page_content)
            for w in d.page_content.split():
                if w.isalnum():
                    self.vocab.add(w.lower())

        self.vectorstore = Chroma.from_documents(
            docs,
            embedding=self.embedding_model,
            persist_directory=self.chroma_dir,
            collection_name="srd_final",
        )

        # Both retrievers fetch 20 candidates; reranking narrows later.
        self.chroma_retriever = self.vectorstore.as_retriever(search_kwargs={"k": 20})
        self.bm25_retriever = BM25Retriever.from_documents(docs)
        self.bm25_retriever.k = 20

        print(f"✅ Indexed {len(docs)} SRD chunks")

    # -------------------- INTENT --------------------
    def detect_intent(self, q: str) -> str:
        """Classify a query as 'enumeration' (list-everything style) or 'qa'."""
        q = q.lower()
        if any(w in q for w in ["list", "enumerate", "all functional", "requirements of"]):
            return "enumeration"
        return "qa"

    # -------------------- ENUMERATION (NO SIM SEARCH) --------------------
    def list_functional_requirements(self) -> List[str]:
        """Fetch every chunk tagged section_type == 'functional' straight
        from Chroma via a metadata filter (no similarity search)."""
        data = self.vectorstore.get(
            where={"section_type": "functional"}
        )
        return data.get("documents", [])

    # -------------------- QUERY --------------------
    def answer(self, query: str, claude) -> str:
        """Answer *query* with Claude, grounded in retrieved SRD context.

        Enumeration queries bypass similarity search; QA queries merge
        dense + sparse candidates, rerank with the cross-encoder, and
        prompt Claude with only the surviving context.
        """
        intent = self.detect_intent(query)

        if intent == "enumeration":
            items = self.list_functional_requirements()
            if not items:
                return "I could not find sufficient information in the provided SRD."

            prompt = f"""
You are a Senior Project Architect.

List ALL functional requirements below.
Do not merge, summarize, or invent anything.

REQUIREMENTS:
{chr(10).join(items)}
"""
            return claude.generate_raw(prompt)

        # ---------- Normal QA ----------
        dense = self.chroma_retriever.invoke(query)
        sparse = self.bm25_retriever.invoke(query)

        pool = dense + sparse
        pairs = [[query, d.page_content] for d in pool]
        scores = self.reranker.predict(pairs)

        # Keep at most 8 best-reranked chunks above the score floor (-6).
        top = [
            d.page_content
            for d, s in sorted(zip(pool, scores), key=lambda x: x[1], reverse=True)
            if s > -6
        ][:8]

        if not top:
            return "I could not find sufficient information in the provided SRD."

        # NOTE(review): top[:4000] slices the *list* (already <= 8 items),
        # not characters — likely intended as a character cap on the context.
        ctx = "\n---\n".join(top[:4000])

        prompt = f"""
Answer using ONLY the SRD context below.
If unsupported, say so explicitly.

CONTEXT:
{ctx}

QUESTION:
{query}
"""
        return claude.generate_raw(prompt)
339
+
340
+
341
+ # ============================================================
342
+ # CLAUDE ANSWERER
343
+ # ============================================================
344
class ClaudeAnswerer:
    """Thin wrapper around the Anthropic client for plain-text completions."""

    def __init__(self):
        # Fail fast when the optional anthropic package is not installed.
        if Anthropic is None:
            raise RuntimeError("anthropic not installed")

        self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
        self.model = "claude-sonnet-4-5-20250929"

    def generate_raw(self, prompt: str) -> str:
        """Send *prompt* as a single user message and return the text reply."""
        resp = self.client.messages.create(
            model=self.model,
            max_tokens=1200,
            temperature=0.2,
            messages=[{"role": "user", "content": prompt}],
        )
        return resp.content[0].text
srd_engine_v2.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # srd_engine_v2.py
2
+ import os
3
+ import re
4
+ import io
5
+ import base64
6
+ import hashlib
7
+ from datetime import datetime
8
+ from typing import List, Optional
9
+
10
+ from dotenv import load_dotenv
11
+ load_dotenv()
12
+
13
+ # -------------------- Data Processing --------------------
14
+ import pdfplumber
15
+ import camelot
16
+ from pdf2image import convert_from_path, pdfinfo_from_path
17
+ import pytesseract
18
+ from PIL import Image
19
+
20
+ # -------------------- Vector Store --------------------
21
+ from langchain_community.vectorstores import Chroma
22
+ from langchain_core.documents import Document
23
+
24
+ # -------------------- Claude --------------------
25
+ try:
26
+ from anthropic import Anthropic
27
+ except ImportError:
28
+ Anthropic = None
29
+
30
+ from srd_engine_final import SRDChatbotEngine, ClaudeAnswerer
31
+
32
+ POPPLER_PATH = os.getenv("POPPLER_PATH")
33
+
34
+
35
+ # =====================================================
36
+ # UTILS
37
+ # =====================================================
38
def content_hash(text: str) -> str:
    """Return the hex MD5 digest of *text*, used to de-duplicate chunks."""
    digest = hashlib.md5(text.encode("utf-8"))
    return digest.hexdigest()
40
+
41
+
42
def resize_for_claude(image: Image.Image, max_dim: int = 7900) -> Image.Image:
    """Downscale *image* so that neither side exceeds *max_dim* pixels.

    The aspect ratio is preserved. Images already within bounds are
    returned unchanged (the same object, no copy is made).
    """
    width, height = image.size
    if max(width, height) <= max_dim:
        return image
    factor = min(max_dim / width, max_dim / height)
    new_size = (int(width * factor), int(height * factor))
    return image.resize(new_size, Image.LANCZOS)
48
+
49
+
50
+ # =====================================================
51
+ # SECTION / HEADER DETECTION
52
+ # =====================================================
53
+ SECTION_PATTERNS = {
54
+ "functional": re.compile(r"(functional\s+requirements|FR-\d+)", re.I),
55
+ "nonfunctional": re.compile(r"(non[-\s]?functional\s+requirements|NFR-\d+)", re.I),
56
+ }
57
+
58
+
59
+ def detect_section_type(text: str) -> str:
60
+ for k, pat in SECTION_PATTERNS.items():
61
+ if pat.search(text):
62
+ return k
63
+ return "general"
64
+
65
+
66
+ # =====================================================
67
+ # SRD-AWARE SPLITTER (REQUIREMENT SAFE)
68
+ # =====================================================
69
class SmartSRDSplitter:
    """Split SRD text into header-anchored chunks without cutting a
    requirement in half.

    A new chunk starts whenever a line looks like a requirement ID
    (FR-n / NFR-n), a numbered heading (x.y), or a 'Title:'-style header.
    """

    HEADER_REGEX = re.compile(
        r"(FR-\d+|NFR-\d+|\d+\.\d+|[A-Z][A-Za-z\s]{3,}:)",
        re.I
    )

    def split_text(self, text: str) -> List[Document]:
        """Return one Document per detected section of *text*.

        Each Document carries the header line (truncated to 80 chars) that
        introduced its section and an inferred ``section_type``.
        """
        chunks: List[Document] = []
        pending: List[str] = []
        header = "General"

        def flush() -> None:
            # Emit the buffered lines (if any) as one Document, tagged with
            # the header that introduced them.
            if not pending:
                return
            body = "\n".join(pending)
            chunks.append(
                Document(
                    page_content=body,
                    metadata={
                        "type": "text",
                        "header": header,
                        "section_type": detect_section_type(body),
                    },
                )
            )

        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            if self.HEADER_REGEX.match(line):
                # A header closes the previous section and opens a new one
                # that begins with the header line itself.
                flush()
                pending = [line]
                header = line[:80]
            else:
                pending.append(line)

        flush()
        return chunks
114
+
115
+
116
+ # =====================================================
117
+ # DIAGRAM INTERPRETER
118
+ # =====================================================
119
class DiagramInterpreter:
    """Turn a diagram image into descriptive text for indexing.

    Uses Claude Vision when enabled and configured; when no vision output
    was produced (disabled, misconfigured, or an empty reply), falls back
    to Tesseract OCR of the raw image.
    """

    def __init__(self):
        # Anthropic client is created lazily on the first Claude-enabled call.
        self._anthropic = None

    def process_image(
        self,
        image: Image.Image,
        label: str,
        use_qwen: bool,
        use_claude: bool
    ) -> str:
        """Describe *image* (a diagram of kind *label*) as plain text.

        Args:
            image: source diagram (PIL image).
            label: human-readable diagram kind, interpolated into the prompt.
            use_qwen: accepted for call compatibility but unused — no Qwen
                path is implemented in this class.
            use_claude: when True, attempt a Claude Vision description.

        Returns:
            The vision model's description (or a diagnostic message when
            Claude was requested but unavailable); if nothing was produced,
            the Tesseract OCR text of the image.
        """
        sections: List[str] = []

        if use_claude:
            if Anthropic is None:
                # Diagnostic messages are appended as content so the caller
                # still gets an indexable explanation of what happened.
                sections.append("Claude Vision requested but anthropic package is not installed.")
            else:
                if not self._anthropic:
                    api_key = os.getenv("ANTHROPIC_API_KEY")
                    if not api_key:
                        sections.append("Claude Vision requested but ANTHROPIC_API_KEY is not set.")
                    else:
                        self._anthropic = Anthropic(api_key=api_key)

                if self._anthropic:
                    # Claude rejects oversized images: downscale first, then
                    # ship as base64-encoded JPEG.
                    safe_image = resize_for_claude(image)
                    buf = io.BytesIO()
                    safe_image.convert("RGB").save(buf, format="JPEG", quality=85)
                    b64 = base64.b64encode(buf.getvalue()).decode()

                    resp = self._anthropic.messages.create(
                        model=os.getenv("CLAUDE_VISION_MODEL", "claude-sonnet-4-5-20250929"),
                        max_tokens=600,
                        temperature=0.2,
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": f"Explain this {label} diagram for an SRD."},
                                    {
                                        "type": "image",
                                        "source": {"type": "base64", "media_type": "image/jpeg", "data": b64},
                                    },
                                ],
                            }
                        ],
                    )

                    # Fast path: first content block is text. If the response
                    # shape differs, concatenate every text block instead.
                    try:
                        text0 = resp.content[0].text  # type: ignore[attr-defined]
                    except Exception:
                        text0 = ""
                        for block in getattr(resp, "content", []):
                            t = getattr(block, "text", None)
                            if t:
                                text0 += t + "\n"
                    text0 = text0.strip()

                    if text0:
                        sections.append(text0)

        if not sections:
            # Nothing from the vision path: fall back to plain OCR.
            sections.append(pytesseract.image_to_string(image))

        return "\n\n".join([s for s in sections if s.strip()]).strip()
184
+
185
+
186
+ # =====================================================
187
+ # SMART KNOWLEDGE BASE (MULTI-USER + MULTI-CHAT SAFE)
188
+ # =====================================================
189
class SmartKnowledgeBase(SRDChatbotEngine):
    """Project/chat/user-scoped knowledge base on top of SRDChatbotEngine.

    Every document stored here is tagged with (project_id, chat_id, user_id)
    and every read filters on the same triple, so different users and chats
    never see each other's chunks even though they share a single Chroma
    collection.
    """

    def __init__(self, chroma_dir="chroma_global_db"):
        super().__init__(chroma_dir)
        # Scope identifiers; all three must be set before ingest or query
        # (enforced by _require_scope).
        self.current_project_id: Optional[str] = None
        self.current_chat_id: Optional[str] = None  # ✅ NEW
        self.current_user_id: Optional[str] = None  # ✅ NEW

        # One shared persistent collection; isolation is enforced via
        # metadata filters, not separate collections.
        # NOTE(review): assumes SRDChatbotEngine.__init__ provides
        # self.embedding_model — confirm in srd_engine_final.py.
        self.vectorstore = Chroma(
            persist_directory=chroma_dir,
            embedding_function=self.embedding_model,
            collection_name="srd_knowledge"
        )

        self.interpreter = DiagramInterpreter()

    # ------------------------------
    # SESSION SCOPING
    # ------------------------------
    def set_current_project(self, name: str):
        # Normalize to a stable slug so the same project name always maps
        # to the same metadata value.
        self.current_project_id = name.lower().replace(" ", "_")

    def set_current_chat(self, chat_id: str):
        self.current_chat_id = chat_id

    def set_current_user(self, user_id: str):
        self.current_user_id = user_id

    def _require_scope(self):
        """Raise RuntimeError unless project, chat, and user are all set."""
        if not self.current_project_id:
            raise RuntimeError("Project not set. Call set_current_project(...) first.")
        if not self.current_chat_id:
            raise RuntimeError("Chat not set. Call set_current_chat(...) first.")
        if not self.current_user_id:
            raise RuntimeError("User not set. Call set_current_user(...) first.")

    def _where_scope(self) -> dict:
        """Build a Chroma `where` filter restricting reads to this scope."""
        # Chroma where filter (strict isolation)
        return {
            "$and": [
                {"project_id": {"$eq": self.current_project_id}},
                {"chat_id": {"$eq": self.current_chat_id}},
                {"user_id": {"$eq": self.current_user_id}},
            ]
        }

    # ------------------------------
    # LEARNING / USER CORRECTION
    # ------------------------------
    def learn_from_interaction(self, query: str, correction_text: str):
        """Persist a user-provided correction as a high-priority chunk.

        The correction is stored in the vector store under the current
        scope; empty/whitespace-only corrections are ignored.
        """
        self._require_scope()

        if not correction_text or not correction_text.strip():
            return

        # Classify the correction itself; if inconclusive, fall back to
        # classifying the query that prompted it.
        inferred = detect_section_type(correction_text)
        if inferred == "general":
            inferred = self.detect_requirement_type(query)

        doc = Document(
            page_content=correction_text.strip(),
            metadata={
                "type": "user_correction",
                "section_type": inferred,
                "project_id": self.current_project_id,
                "chat_id": self.current_chat_id,
                "user_id": self.current_user_id,
                "source": "user_feedback",
                "timestamp": datetime.now().isoformat(),
                "original_query": query,
                "priority": "high",
            },
        )

        self.vectorstore.add_documents([doc])
        self.vectorstore.persist()

    # ------------------------------
    # INGESTION
    # ------------------------------
    def process_document_step(self, path, ftype, label, use_qwen, use_claude):
        """Ingest one file into the scoped vector store.

        Args:
            path: filesystem path to the file.
            ftype: "pdf_text" (text PDF, plus table extraction) or
                "diagram" (image, or PDF rendered page-by-page).
            label: diagram label forwarded to DiagramInterpreter.
            use_qwen, use_claude: forwarded to DiagramInterpreter.

        Returns:
            The list of de-duplicated Documents that were added.
        """
        self._require_scope()

        docs: List[Document] = []

        if ftype == "pdf_text":
            with pdfplumber.open(path) as pdf:
                text = "\n".join((p.extract_text() or "") for p in pdf.pages)

            splitter = SmartSRDSplitter()
            docs = splitter.split_text(text)

            # Tables are extracted separately; camelot failures (e.g. scanned
            # or malformed PDFs) are deliberately non-fatal best-effort.
            try:
                tables = camelot.read_pdf(path, pages="all", flavor="stream")
                for t in tables:
                    docs.append(
                        Document(
                            page_content=t.df.to_markdown(),
                            metadata={"type": "table", "section_type": "general"},
                        )
                    )
            except Exception:
                pass

        elif ftype == "diagram":
            if path.lower().endswith(".pdf"):
                # Render PDF pages one at a time to bound memory usage.
                info = pdfinfo_from_path(path, poppler_path=POPPLER_PATH)
                for page in range(1, info["Pages"] + 1):
                    imgs = convert_from_path(
                        path,
                        first_page=page,
                        last_page=page,
                        dpi=150,
                        poppler_path=POPPLER_PATH,
                    )
                    for img in imgs:
                        txt = self.interpreter.process_image(img, label, use_qwen, use_claude)
                        docs.append(Document(page_content=txt, metadata={"type": "diagram", "section_type": "general"}))
            else:
                img = Image.open(path)
                txt = self.interpreter.process_image(img, label, use_qwen, use_claude)
                docs.append(Document(page_content=txt, metadata={"type": "diagram", "section_type": "general"}))

        # ------------------------------
        # Dedup + metadata
        # ------------------------------
        # Drop exact-content duplicates (by MD5) and stamp every surviving
        # chunk with the current scope so reads can filter on it.
        seen = set()
        final_docs: List[Document] = []

        for d in docs:
            h = content_hash(d.page_content or "")
            if h in seen:
                continue
            seen.add(h)

            d.metadata["project_id"] = self.current_project_id
            d.metadata["chat_id"] = self.current_chat_id
            d.metadata["user_id"] = self.current_user_id
            d.metadata["timestamp"] = datetime.now().isoformat()

            final_docs.append(d)

        if final_docs:
            self.vectorstore.add_documents(final_docs)
            self.vectorstore.persist()

        return final_docs

    # ------------------------------
    # INTENT DETECTION
    # ------------------------------
    def detect_intent(self, query: str) -> str:
        """Keyword-classify *query* as 'enumeration', 'explanation', or 'lookup'."""
        q = (query or "").lower()
        if any(w in q for w in ["list", "show all", "enumerate", "give me all", "all of the"]):
            return "enumeration"
        if any(w in q for w in ["explain", "describe", "how", "why", "what is", "what are"]):
            return "explanation"
        return "lookup"

    # ------------------------------
    # REQUIREMENT TYPE DETECTION
    # ------------------------------
    def detect_requirement_type(self, query: str) -> str:
        """Keyword-classify *query* as 'functional' or 'nonfunctional'.

        Defaults to 'functional' when no keyword matches.
        """
        q = (query or "").lower()

        # Explicit NFR vocabulary first...
        if any(w in q for w in [
            "non functional", "non-functional", "nonfunctional", "nfr", "nfrs",
            "quality attributes", "quality requirements"
        ]):
            return "nonfunctional"

        # ...then quality-attribute terms that imply NFRs.
        if any(w in q for w in [
            "performance", "security", "availability", "reliability", "scalability",
            "usability", "maintainability", "portability", "compliance", "privacy",
            "latency", "throughput", "encryption", "audit", "logging", "backup",
        ]):
            return "nonfunctional"

        if any(w in q for w in ["functional", "fr-", "frs", "use case", "features"]):
            return "functional"

        return "functional"

    # ------------------------------
    # SMART RESPONSE (CHAT-ISOLATED)
    # ------------------------------
    def generate_smart_response(self, query: str, claude: ClaudeAnswerer) -> str:
        """Answer *query* from the scoped store via Claude.

        Enumeration-style queries dump ALL matching requirement chunks into
        the prompt; everything else uses scoped similarity search.
        """
        self._require_scope()

        intent = self.detect_intent(query)

        # =============== ENUMERATION MODE ===============
        if intent == "enumeration":
            req_type = self.detect_requirement_type(query)

            # Fetch EVERY chunk of the requested section type within scope
            # (no similarity ranking — enumeration must be complete).
            raw = self.vectorstore.get(
                where={
                    "$and": [
                        {"project_id": {"$eq": self.current_project_id}},
                        {"chat_id": {"$eq": self.current_chat_id}},
                        {"user_id": {"$eq": self.current_user_id}},
                        {"section_type": {"$eq": req_type}},
                    ]
                }
            )

            docs = raw.get("documents", []) or []

            # NFR sections are often not labelled; retry against 'general'
            # chunks before giving up.
            if not docs and req_type == "nonfunctional":
                raw2 = self.vectorstore.get(
                    where={
                        "$and": [
                            {"project_id": {"$eq": self.current_project_id}},
                            {"chat_id": {"$eq": self.current_chat_id}},
                            {"user_id": {"$eq": self.current_user_id}},
                            {"section_type": {"$eq": "general"}},
                        ]
                    }
                )
                docs2 = raw2.get("documents", []) or []
                if docs2:
                    docs = docs2

            if not docs:
                return "I could not find sufficient information in the provided SRD."

            title = "FUNCTIONAL REQUIREMENTS" if req_type == "functional" else "NON-FUNCTIONAL REQUIREMENTS"

            # chr(10) is '\n' — f-string expressions cannot contain backslashes
            # on older Python versions.
            prompt = f"""
You are a Senior Project Architect.

Return a COMPLETE numbered list of the {title} found below.
Do NOT invent items. Do NOT omit items. If duplicates exist, keep only one copy.

REQUIREMENTS:
{chr(10).join(docs)}
"""
            # Calls the Anthropic client directly (not claude.generate_raw)
            # to use a larger max_tokens budget for long lists.
            return claude.client.messages.create(
                model=claude.model,
                max_tokens=1400,
                temperature=0.2,
                messages=[{"role": "user", "content": prompt}],
            ).content[0].text

        # =============== NORMAL QA MODE ===============
        docs = self.vectorstore.similarity_search(
            query,
            k=12,
            filter=self._where_scope(),  # ✅ chat + user + project scoped
        )

        if not docs:
            return "I could not find sufficient information in the provided SRD."

        # Prefix each chunk with its SRD header so Claude can cite sections.
        ctx = ""
        for d in docs[:8]:
            ctx += f"[{d.metadata.get('header', 'SRD')}]\n{d.page_content}\n---\n"

        prompt = f"""
You are a Senior Project Architect.

Answer ONLY using the SRD context.
If unsupported, say so explicitly.

CONTEXT:
{ctx}

QUESTION:
{query}
"""
        return claude.client.messages.create(
            model=claude.model,
            max_tokens=1000,
            temperature=0.3,
            messages=[{"role": "user", "content": prompt}],
        ).content[0].text