Spaces:

Mahmous
/

chatbot3

Build error

App Files Files Community

Mahmoud Sayed committed on Oct 26

Commit

7dc8b43

1 Parent(s): 64b4ab7

First

Browse files

Files changed (27) hide show

.env +2 -0
Dockerfile +5 -0
__pycache__/pinecone.cpython-311.pyc +0 -0
api.py +180 -0
data/clean_medquad.py +37 -0
data/coaching_millionaer_dataset.json +0 -0
main.py +32 -0
model/1_Pooling/config.json +10 -0
model/README.md +173 -0
model/config.json +25 -0
model/config_sentence_transformers.json +14 -0
model/model.safetensors +3 -0
model/modules.json +20 -0
model/sentence_bert_config.json +4 -0
model/special_tokens_map.json +37 -0
model/tokenizer.json +0 -0
model/tokenizer_config.json +65 -0
model/vocab.txt +0 -0
pinecone_index.py +45 -0
qa/__pycache__/biobert_qa.cpython-311.pyc +0 -0
qa/biobert_qa.py +48 -0
requirements.txt +8 -0
retriever/__pycache__/bm25_retriever.cpython-311.pyc +0 -0
retriever/__pycache__/faiss_retriever.cpython-311.pyc +0 -0
retriever/bm25_retriever.py +48 -0
retriever/faiss_retriever.py +89 -0
retriever/pinecone_retriever.py +22 -0

.env ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ OPENAI_API_KEY = <redacted: a live credential was committed here; revoke and rotate it>
2	+ PINECONE_API_KEY = <redacted: a live credential was committed here; revoke and rotate it>

Dockerfile ADDED Viewed

	@@ -0,0 +1,5 @@

+# Base image for the API container.
+FROM python:3.10-slim
+WORKDIR /app
+# NOTE(review): this copies the whole build context. The commit's file list includes .env and
+# __pycache__ directories, so they would be baked into the image unless a .dockerignore
+# (not visible here) excludes them — confirm.
+COPY . /app
+RUN pip install -r requirements.txt
+# Runs api.py as a script, which starts the Flask app through its __main__ guard.
+CMD ["python", "api.py"]

__pycache__/pinecone.cpython-311.pyc ADDED Viewed

Binary file (1.77 kB). View file

api.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import os
+import traceback
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from dotenv import load_dotenv
+from openai import OpenAI
+from langdetect import detect
+from googletrans import Translator
+from sentence_transformers import SentenceTransformer
+from pinecone import Pinecone
+# ---------- Config ----------
+# NOTE(review): DATASET_PATH is not referenced anywhere else in the api.py code shown here.
+DATASET_PATH = "data/coaching_millionaer_dataset.json"
+# Populate the process environment from a .env file before the keys are read below.
+load_dotenv()
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")  # add this to your .env
+PINECONE_INDEX_NAME = "ebook"
+# ---------- App ----------
+app = Flask(__name__)
+# CORS is configured for the /ask route only, allowing any origin.
+CORS(app, resources={r"/ask": {"origins": "*"}})
+# ---------- OpenAI Client ----------
+# `client` stays None when no key is configured; only a warning is printed.
+client = None
+if OPENAI_API_KEY:
+    client = OpenAI(api_key=OPENAI_API_KEY)
+else:
+    print("⚠️  OPENAI_API_KEY is missing in .env")
+# ---------- Retriever ----------
+# `retriever` stays None if anything in the try block fails; the error is printed and
+# the module keeps loading, so request handlers must cope with a missing retriever.
+retriever = None
+try:
+    if not PINECONE_API_KEY:
+        raise ValueError("PINECONE_API_KEY missing in .env")
+    pc = Pinecone(api_key=PINECONE_API_KEY)
+    index = pc.Index(PINECONE_INDEX_NAME)
+    # Loaded by model name; the repository also ships a copy of this model under model/.
+    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+    class PineconeRetriever:
+        """Embed a query and look up the nearest passages in a Pinecone index."""
+        def __init__(self, index, embedder):
+            """Keep references to the index handle and the embedding model."""
+            self.index = index
+            self.embedder = embedder
+        def retrieve(self, query, top_k=10):
+            """Return up to ``top_k`` matches as dicts with "context", "page" and "score"."""
+            emb = self.embedder.encode(query).tolist()
+            res = self.index.query(vector=emb, top_k=top_k, include_metadata=True)
+            matches = res.get("matches", [])
+            results = []
+            for match in matches:
+                # Missing metadata or fields fall back to "" / None / 0 rather than raising.
+                meta = match.get("metadata", {})
+                results.append({
+                    "context": meta.get("context", ""),
+                    "page": meta.get("page"),
+                    "score": match.get("score", 0)
+                })
+            return results
+    retriever = PineconeRetriever(index, embedder)
+    print("✅ Pinecone retriever initialized successfully.")
+except Exception as e:
+    print("❌ Retriever initialization failed:", e)
+    traceback.print_exc()
+# NOTE(review): `translator` is created but never used in the api.py code shown here.
+translator = Translator()
+# ---------- Helpers ----------
+def detect_language(question: str) -> str:
+    """Return what ``detect`` reports for ``question``, or "unknown" if it raises."""
+    try:
+        guessed = detect(question)
+    except Exception:
+        return "unknown"
+    return guessed
+def normalize_language(lang: str, text: str) -> str:
+    """Override a Dutch ("nl") detection with German ("de") when cue words are present.
+    Short German questions such as "wer is ..." get detected as Dutch. The override
+    fires only for ``lang == "nl"`` and only when a cue appears as a whole word; any
+    other ``lang`` is returned unchanged.
+    """
+    import re  # local import keeps this function self-contained
+    if lang != "nl":
+        return lang
+    german_cues = {"wer", "was", "wie", "javid", "coaching"}
+    # Match whole words, not substrings: a plain `in` test also fires on Dutch words
+    # that merely contain a cue ("werkt", "wassen", "wiel") and would mislabel them.
+    words = set(re.findall(r"\w+", text.lower()))
+    return "de" if words & german_cues else lang
+def system_prompt_book_only() -> str:
+    """Build the system prompt for answering from retrieved book passages."""
+    sentences = (
+        "You are CoachingBot, a professional mentor trained on the book 'Coaching Millionär' by Javid Niazi-Hoffmann. ",
+        "Use only the provided book context to answer the question. ",
+        "If the user asks about people like Javid Niazi-Hoffmann, describe them factually using the book content. ",
+        "Mention page numbers where possible. ",
+        "If the context is not relevant, say you don’t have that information in the book and provide a general, helpful answer. ",
+        "Always respond in the same language as the user's question, even if the book content is in another language.",
+    )
+    # Each sentence carries its own trailing space, so a plain join reproduces the prompt.
+    return "".join(sentences)
+def system_prompt_fallback() -> str:
+    """Build the system prompt for answering without any book context."""
+    sentences = (
+        "You are CoachingBot, a helpful business and life mentor. ",
+        "The question cannot be answered from the book, so answer using your general coaching knowledge. ",
+        "Always respond in the same language as the user's question, even if the book content is in another language. ",
+        "Do not invent book citations.",
+    )
+    # Each sentence carries its own trailing space, so a plain join reproduces the prompt.
+    return "".join(sentences)
+def format_answers(question: str, answer: str, results):
+    """Wrap one answer in the ``{"answers": [...]}`` payload returned to clients.
+    ``results`` is an iterable of dicts that may carry "page" and "score". The
+    "source" field lists each cited page once, in first-seen order, or "No source"
+    when no result has a page. "bm25_score" holds the highest score, or 0.0 when
+    there are no results; the key name is kept as-is because clients read it.
+    """
+    # dict.fromkeys de-duplicates while preserving order, so several passages from
+    # the same page no longer produce "Seite 3, Seite 3".
+    pages = list(dict.fromkeys(f"Seite {r['page']}" for r in results if r.get("page")))
+    source = ", ".join(pages) if pages else "No source"
+    # A score that is present but None counts as 0.0 instead of breaking max().
+    top_score = max((r.get("score") or 0.0 for r in results), default=0.0)
+    return {"answers": [{"question": question, "answer": answer, "source": source, "bm25_score": top_score}]}
+# ---------- Routes ----------
+@app.route("/", methods=["GET"])
+def health():
+    """Report service status and which components are configured, as JSON."""
+    status = {
+        "status": "running",
+        "retriever_ready": bool(retriever),
+        "openai_key_loaded": bool(OPENAI_API_KEY),
+        "pinecone_key_loaded": bool(PINECONE_API_KEY),
+        "index_name": PINECONE_INDEX_NAME,
+    }
+    return jsonify(status)
+@app.route("/ask", methods=["POST", "OPTIONS"])
+def ask():
+    """Answer one question from retrieved book passages plus the chat model.
+    Expects a JSON body with a "question" string. Every outcome, including errors,
+    is returned as a ``format_answers`` payload with HTTP 200 (204 for an OPTIONS
+    preflight); that contract is kept so existing clients continue to work.
+    """
+    if request.method == "OPTIONS":
+        return ("", 204)
+    try:
+        data = request.get_json(force=True) or {}
+        question = (data.get("question") or "").strip()
+    except Exception:
+        # Covers unparsable bodies as well as JSON that is not an object with a string question.
+        return jsonify(format_answers("", "Invalid JSON request", [])), 200
+    if not question:
+        return jsonify(format_answers("", "Please enter a question.", [])), 200
+    # Startup leaves these as None when a key is missing or initialization failed.
+    # Report that explicitly instead of surfacing an AttributeError on None.
+    if retriever is None:
+        return jsonify(format_answers(question, "Retriever error: retriever is not initialized", [])), 200
+    if client is None:
+        return jsonify(format_answers(question, "⚠️ OpenAI call failed: OPENAI_API_KEY is not configured", [])), 200
+    print(f"\n--- User Question ---\n{question}")
+    # Detect and normalize language (currently only logged; the prompts ask the model
+    # to mirror the user's language on its own).
+    user_lang = normalize_language(detect_language(question), question)
+    print(f"Detected language: {user_lang}")
+    # Retrieve context
+    context, results = "", []
+    try:
+        raw_results = retriever.retrieve(question)
+        MIN_SCORE = 0.10  # passages scoring below this are dropped
+        results = [r for r in raw_results if r.get("score", 0) >= MIN_SCORE]
+        if results:
+            # Prefix the page only when one is known, so "(Seite None)" never reaches the prompt.
+            context = "\n\n---\n\n".join(
+                f"(Seite {r['page']}) {r['context']}" if r.get("page") else r["context"]
+                for r in results
+            )
+    except Exception as e:
+        traceback.print_exc()
+        return jsonify(format_answers(question, f"Retriever error: {e}", [])), 200
+    # Build prompts
+    if context:
+        sys_prompt = system_prompt_book_only()
+        user_content = f"Question: {question}\n\nBook context:\n{context}"
+    else:
+        sys_prompt = system_prompt_fallback()
+        user_content = question
+    # Query GPT
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": sys_prompt},
+                {"role": "user", "content": user_content}
+            ],
+            max_tokens=700,
+        )
+        # The message content can be None; treat that as an empty answer.
+        answer = (response.choices[0].message.content or "").strip()
+    except Exception as e:
+        traceback.print_exc()
+        return jsonify(format_answers(question, f"⚠️ OpenAI call failed: {e}", [])), 200
+    return jsonify(format_answers(question, answer, results))
+# ---------- Run ----------
+if __name__ == "__main__":
+    # Port comes from the PORT environment variable, falling back to 5000.
+    port = int(os.environ.get("PORT", 5000))
+    print(f"🚀 Server started on port {port}")
+    # Listen on all interfaces so the server is reachable from outside the container.
+    app.run(host="0.0.0.0", port=port)

data/clean_medquad.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import pandas as pd
+import json
+import os
+# Where the raw CSV is read from and where the cleaned JSON is written.
+input_csv_path = "data/medquad.csv"
+output_json_path = "data/medquad_cleaned.json"
+# Create the output folder up front so the final write cannot fail on a missing directory.
+output_dir = os.path.dirname(output_json_path)
+os.makedirs(output_dir, exist_ok=True)
+# Read the raw question/answer table.
+df = pd.read_csv(input_csv_path)
+# Rows without a question or an answer are unusable; drop them first.
+df.dropna(subset=["question", "answer"], inplace=True)
+# Trim surrounding whitespace on both text columns.
+for column in ("question", "answer"):
+    df[column] = df[column].str.strip()
+# A missing source becomes an empty string before trimming.
+df["source"] = df["source"].fillna("").str.strip()
+# Keep one row per distinct (question, answer) pair.
+df.drop_duplicates(subset=["question", "answer"], inplace=True)
+# Reshape each row into the {"title", "context", "source"} record layout.
+cleaned_data = []
+for question_text, answer_text, source_text in zip(df["question"], df["answer"], df["source"]):
+    cleaned_data.append({
+        "title": question_text,
+        "context": answer_text,
+        "source": source_text,
+    })
+# Write the records out as indented JSON.
+with open(output_json_path, "w", encoding="utf-8") as f:
+    json.dump(cleaned_data, f, indent=2)
+print(f"✅ Cleaned data saved to: {output_json_path}")

data/coaching_millionaer_dataset.json ADDED Viewed

The diff for this file is too large to render. See raw diff

main.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from retriever.bm25_retriever import BM25Retriever
+from qa.biobert_qa import BioBERTAnswerExtractor
+def main():
+    """Run the interactive MedBot prompt loop until the user quits.
+    Each question is matched against the MedQuAD passages with BM25, and BioBERT
+    then extracts a short answer span from each of the top three passages.
+    """
+    # Initialize retriever and QA model
+    retriever = BM25Retriever("data/medquad_cleaned.json")
+    qa = BioBERTAnswerExtractor()
+    print("\n🩺 MedBot is ready! Type your question or 'exit' to quit.")
+    while True:
+        try:
+            question = input("\nAsk a medical question: ").strip()
+        except (EOFError, KeyboardInterrupt):
+            # Ctrl-D / Ctrl-C or a closed stdin ends the session instead of a traceback.
+            print("\n👋 Goodbye!")
+            break
+        if question.lower() in {"exit", "quit"}:
+            print("👋 Goodbye!")
+            break
+        if not question:
+            # Nothing to search for; prompt again.
+            continue
+        # Step 1: Retrieve top 3 passages
+        results = retriever.retrieve(question, top_k=3)
+        # Step 2: Run BioBERT on each passage
+        print("\n🔍 Best answers:")
+        for idx, item in enumerate(results, 1):
+            context = item["context"]
+            # extract_answer returns "" to signal "use a fallback"; show the full
+            # retrieved passage in that case rather than an empty answer line.
+            answer = qa.extract_answer(question, context) or context
+            print(f"\nResult {idx}")
+            print(f"Q: {item['title']}")
+            print(f"A: {answer}")
+            print(f"Source: {item['source']} (BM25 Score: {item['score']:.2f})")
+if __name__ == "__main__":
+    main()

model/1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "word_embedding_dimension": 384,
+    "pooling_mode_cls_token": false,
+    "pooling_mode_mean_tokens": true,
+    "pooling_mode_max_tokens": false,
+    "pooling_mode_mean_sqrt_len_tokens": false,
+    "pooling_mode_weightedmean_tokens": false,
+    "pooling_mode_lasttoken": false,
+    "include_prompt": true
+}

model/README.md ADDED Viewed

	@@ -0,0 +1,173 @@

+---
+language: en
+license: apache-2.0
+library_name: sentence-transformers
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+- transformers
+datasets:
+- s2orc
+- flax-sentence-embeddings/stackexchange_xml
+- ms_marco
+- gooaq
+- yahoo_answers_topics
+- code_search_net
+- search_qa
+- eli5
+- snli
+- multi_nli
+- wikihow
+- natural_questions
+- trivia_qa
+- embedding-data/sentence-compression
+- embedding-data/flickr30k-captions
+- embedding-data/altlex
+- embedding-data/simple-wiki
+- embedding-data/QQP
+- embedding-data/SPECTER
+- embedding-data/PAQ_pairs
+- embedding-data/WikiAnswers
+pipeline_tag: sentence-similarity
+---
+# all-MiniLM-L6-v2
+This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
+## Usage (Sentence-Transformers)
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+```
+pip install -U sentence-transformers
+```
+Then you can use the model like this:
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+## Usage (HuggingFace Transformers)
+Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
+```python
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+#Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+# Sentences we want sentence embeddings for
+sentences = ['This is an example sentence', 'Each sentence is converted']
+# Load model from HuggingFace Hub
+tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+# Tokenize sentences
+encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+# Compute token embeddings
+with torch.no_grad():
+    model_output = model(**encoded_input)
+# Perform pooling
+sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+# Normalize embeddings
+sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+print("Sentence embeddings:")
+print(sentence_embeddings)
+```
+------
+## Background
+The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
+contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
+1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
+We developed this model during the
+[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
+organized by Hugging Face. We developed this model as part of the project:
+[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as guidance from members of Google's Flax, JAX, and Cloud teams on efficient deep learning frameworks.
+## Intended uses
+Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
+the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
+By default, input text longer than 256 word pieces is truncated.
+## Training procedure
+### Pre-training
+We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
+### Fine-tuning
+We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity for each possible sentence pair in the batch.
+We then apply the cross entropy loss by comparing with true pairs.
+#### Hyper parameters
+We trained our model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core).
+We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
+a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
+#### Training data
+We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
+We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
+| Dataset                                                  | Paper                                    | Number of training tuples  |
+|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
+| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
+| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
+| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
+| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs  | - | 25,316,456 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs  | - | 21,396,559 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs  | - | 21,396,559 |
+| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
+| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
+| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
+| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
+| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
+| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
+| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
+| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
+| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
+| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
+| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
+| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
+| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
+| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
+| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
+| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
+| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
+| **Total** | | **1,170,060,424** |

model/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.56.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

model/config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "__version__": {
+    "sentence_transformers": "5.1.1",
+    "transformers": "4.56.2",
+    "pytorch": "2.8.0+cpu"
+  },
+  "model_type": "SentenceTransformer",
+  "prompts": {
+    "query": "",
+    "document": ""
+  },
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}

model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1377e9af0ca0b016a9f2aa584d6fc71ab3ea6804fae21ef9fb1416e2944057ac
+size 90864192

model/modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

model/sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "max_seq_length": 256,
+    "do_lower_case": false
+}

model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "max_length": 128,
+  "model_max_length": 256,
+  "never_split": null,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}

model/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

pinecone_index.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import json
+import os
+from pinecone import Pinecone
+from sentence_transformers import SentenceTransformer
+# One-off script: embed every passage of the book dataset and upload it to Pinecone.
+# Number of vectors sent per upsert call.
+BATCH_SIZE = 100
+# === Init Pinecone credentials ===
+# The key is read from the environment; credentials must never be hard-coded in source.
+# Checked first so a missing key fails before the dataset and model are loaded.
+api_key = os.environ.get("PINECONE_API_KEY")
+if not api_key:
+    raise SystemExit("PINECONE_API_KEY is not set; export it before running this script.")
+# === Load dataset ===
+with open("data/coaching_millionaer_dataset.json", "r", encoding="utf-8") as f:
+    docs = json.load(f)
+# === Init embedding model ===
+model = SentenceTransformer("./model")
+# === Init Pinecone ===
+pc = Pinecone(api_key=api_key)
+index = pc.Index("ebook")
+# === Upload data ===
+vectors = []
+for i, doc in enumerate(docs):
+    # Handle multiple possible content keys safely
+    content = (
+        doc.get("content")
+        or doc.get("text")
+        or doc.get("context")
+        or doc.get("paragraph")
+    )
+    if not content:
+        print(f"⚠️ Skipping item {i} (no text field found)")
+        continue
+    emb = model.encode(content).tolist()
+    metadata = {"context": content}
+    page = doc.get("page")
+    # Leave "page" out when it is unknown instead of sending a null value.
+    # NOTE(review): Pinecone metadata is believed to reject nulls — confirm against its docs.
+    if page is not None:
+        metadata["page"] = page
+    vectors.append((str(i), emb, metadata))
+    # Upload in batches
+    if len(vectors) >= BATCH_SIZE:
+        index.upsert(vectors=vectors)
+        print(f"✅ Uploaded {i + 1} documents...")
+        vectors = []
+# Upload remaining
+if vectors:
+    index.upsert(vectors=vectors)
+print("🎉 Upload complete! All documents added to Pinecone.")

qa/__pycache__/biobert_qa.cpython-311.pyc ADDED Viewed

Binary file (2.94 kB). View file

qa/biobert_qa.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+import torch
+class BioBERTAnswerExtractor:
+    """Pull a short answer span out of a passage with a question-answering model."""
+    def __init__(self, model_name='dmis-lab/biobert-base-cased-v1.1-squad'):
+        """Load the tokenizer and QA model identified by ``model_name``."""
+        print("⏳ Loading BioBERT model...")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+        print("BioBERT model loaded.")
+    def extract_answer(self, question, context):
+        """Return the predicted answer text, or "" when no usable span is found.
+        "" is returned when the best start lies after the best end, and when the
+        decoded span is shorter than three characters or is a lone [CLS]/[SEP] token.
+        """
+        encoded = self.tokenizer.encode_plus(
+            question, context,
+            return_tensors='pt',
+            truncation=True,
+            max_length=512
+        )
+        # Inference only, so gradient tracking is switched off.
+        with torch.no_grad():
+            prediction = self.model(**encoded)
+        span_start = torch.argmax(prediction.start_logits)
+        span_end = torch.argmax(prediction.end_logits)
+        if span_start > span_end:
+            return ""  # invalid span
+        tokens = self.tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
+        span_tokens = tokens[span_start:span_end + 1]
+        text = self.tokenizer.convert_tokens_to_string(span_tokens).strip()
+        # Junk filter; the length check also covers the empty string.
+        is_junk = len(text) < 3 or text.lower() in ["[cls]", "[sep]"]
+        if is_junk:
+            return ""  # signal to use fallback
+        return text
+# Example usage
+if __name__ == "__main__":
+    qa = BioBERTAnswerExtractor()
+    question = "What are the symptoms of flu?"
+    context = "The flu can cause fever, cough, sore throat, muscle aches, fatigue, and chills."
+    print(f"Answer: {qa.extract_answer(question, context)}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+# NOTE(review): googletrans 4.0.0-rc1 is believed to pin an old httpx release while current
+# openai releases need a newer one, which would make this set unresolvable for pip — confirm.
+# api.py only instantiates Translator and never calls it.
+# NOTE(review): other scripts in this commit import pandas, rank_bm25, nltk, transformers,
+# torch, faiss, numpy and scikit-learn, none of which are listed here.
+flask
+flask-cors
+sentence-transformers
+pinecone-client
+langdetect
+googletrans==4.0.0-rc1
+openai
+python-dotenv

retriever/__pycache__/bm25_retriever.cpython-311.pyc ADDED Viewed

Binary file (3.94 kB). View file

retriever/__pycache__/faiss_retriever.cpython-311.pyc ADDED Viewed

Binary file (6.84 kB). View file

retriever/bm25_retriever.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import json
+from rank_bm25 import BM25Okapi
+import nltk
+from nltk.tokenize import word_tokenize
+nltk.download('punkt')
+class BM25Retriever:
+    """BM25 keyword search over passages loaded from a JSON file."""
+    def __init__(self, json_path):
+        """Load the passages at ``json_path`` and build the BM25 index over them."""
+        self.data = self.load_data(json_path)
+        self.contexts = []
+        self.tokenized_corpus = []
+        for entry in self.data:
+            passage = entry["context"]
+            self.contexts.append(passage)
+            # Index lower-cased tokens; queries are lower-cased the same way.
+            self.tokenized_corpus.append(word_tokenize(passage.lower()))
+        self.bm25 = BM25Okapi(self.tokenized_corpus)
+    def load_data(self, path):
+        """Return the parsed JSON content of ``path``."""
+        with open(path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    def retrieve(self, query, top_k=5):
+        """Return the ``top_k`` best-scoring passages for ``query``, best first.
+        Each result is a dict with "score", "title", "context" and "source".
+        """
+        query_tokens = word_tokenize(query.lower())
+        scores = self.bm25.get_scores(query_tokens)
+        # Pair every score with its position, then keep the highest-scoring positions.
+        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:top_k]
+        return [
+            {
+                "score": score,
+                "title": self.data[position]["title"],
+                "context": self.data[position]["context"],
+                "source": self.data[position].get("source", ""),
+            }
+            for position, score in ranked
+        ]
+# Example usage:
+if __name__ == "__main__":
+    retriever = BM25Retriever("data/medquad_cleaned.json")
+    question = input("Ask a medical question: ")
+    for idx, result in enumerate(retriever.retrieve(question), 1):
+        print(f"\nResult {idx}")
+        print(f"Score: {result['score']:.2f}")
+        print(f"Question: {result['title']}")
+        print(f"Answer: {result['context']}")
+        print(f"Source: {result['source']}")

retriever/faiss_retriever.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import json
+import os
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer, CrossEncoder
+from sklearn.preprocessing import normalize
+class FAISSRetriever:
+    """Dense passage retriever backed by a FAISS inner-product index and a cross-encoder reranker."""
+    def __init__(self, data_path="data/coaching_millionaer_dataset.json"):
+        """
+        Multilingual FAISS retriever for the 'Coaching Millionär' dataset.
+        Supports English and German queries.
+        Reuses the index and metadata files under data/ when both exist;
+        otherwise builds them from ``data_path``.
+        """
+        self.data_path = data_path
+        self.index_path = "data/faiss_index.bin"
+        self.meta_path = "data/faiss_metadata.json"
+        # ✅ multilingual model (English + German + 50+ languages)
+        self.model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
+        # optional reranker for better precision
+        self.reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
+        # Load existing FAISS index or build new one
+        if os.path.exists(self.index_path) and os.path.exists(self.meta_path):
+            self.index = faiss.read_index(self.index_path)
+            with open(self.meta_path, "r", encoding="utf-8") as f:
+                self.metadata = json.load(f)
+            print("✅ Loaded existing FAISS index.")
+        else:
+            self._build_index()
+    def _build_index(self):
+        """Build and save FAISS index from dataset."""
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            dataset = json.load(f)
+        texts = [item["text"] for item in dataset]
+        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
+        # Normalized vectors make the inner-product index score by cosine similarity.
+        embeddings = normalize(embeddings)
+        self.index = faiss.IndexFlatIP(embeddings.shape[1])
+        self.index.add(embeddings)
+        self.metadata = dataset
+        os.makedirs("data", exist_ok=True)
+        faiss.write_index(self.index, self.index_path)
+        with open(self.meta_path, "w", encoding="utf-8") as f:
+            json.dump(self.metadata, f, ensure_ascii=False)
+        print(f"✅ Built new FAISS index from {len(texts)} passages.")
+    def retrieve(self, question, top_k=10):
+        """
+        Retrieve relevant passages from the FAISS index.
+        Each result carries "page", "context", "score" (similarity * 100, plus 5 when
+        the passage mentions a key entity such as 'Javid Niazi-Hoffmann') and
+        "rerank_score"; results are ordered by the cross-encoder's "rerank_score".
+        """
+        query_vec = self.model.encode([question], convert_to_numpy=True)
+        query_vec = normalize(query_vec)
+        scores, indices = self.index.search(query_vec, top_k)
+        results = []
+        # small keyword boost for known entities (lower-cased once, outside the loop)
+        boost_keywords = [k.lower() for k in ["Javid", "Niazi", "Hoffmann", "Coaching", "Millionär"]]
+        for idx, score in zip(indices[0], scores[0]):
+            # FAISS pads missing hits with -1 when fewer than top_k vectors exist.
+            # A bare `idx < len(...)` lets -1 through, and metadata[-1] would then
+            # return the last passage as a bogus result.
+            if 0 <= idx < len(self.metadata):
+                item = self.metadata[idx]
+                text = item["text"]
+                lowered = text.lower()
+                boost = any(k in lowered for k in boost_keywords)
+                final_score = float(score * 100 + (5 if boost else 0))
+                results.append({
+                    "page": item.get("page", ""),
+                    "context": text,
+                    "score": final_score
+                })
+        # ✅ Rerank using cross-encoder for higher accuracy
+        if results:
+            pairs = [(question, r["context"]) for r in results]
+            rerank_scores = self.reranker.predict(pairs)
+            results = [
+                {**r, "rerank_score": float(s)}
+                for r, s in zip(results, rerank_scores)
+            ]
+            results = sorted(results, key=lambda x: x["rerank_score"], reverse=True)[:top_k]
+        return results

retriever/pinecone_retriever.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from pinecone import Pinecone
+from sentence_transformers import SentenceTransformer
+class PineconeRetriever:
+    """Embed a query and fetch the nearest passages from a Pinecone index."""
+    def __init__(self, api_key: str, index_name: str,
+                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
+        """Connect to ``index_name`` and load the query embedding model.
+        Queries must be embedded with the same model that produced the indexed
+        vectors. pinecone_index.py builds the "ebook" index from ./model (its model
+        card is all-MiniLM-L6-v2) and api.py queries with that same model, so it is
+        the default here; pass ``model_name`` for an index built with another model.
+        NOTE(review): the previous hard-coded model was paraphrase-MiniLM-L3-v2 —
+        confirm no index was built with it.
+        """
+        self.model = SentenceTransformer(model_name)
+        self.pinecone = Pinecone(api_key=api_key)
+        self.index = self.pinecone.Index(index_name)
+    def retrieve(self, query: str, top_k: int = 5):
+        """Return up to ``top_k`` matches as dicts with passage text, page and score.
+        The text is exposed under "content" (the original key) and under "context",
+        the key the retriever in api.py uses.
+        """
+        query_emb = self.model.encode(query).tolist()
+        results = self.index.query(vector=query_emb, top_k=top_k, include_metadata=True)
+        matches = results.get("matches", [])
+        docs = []
+        for match in matches:
+            # Tolerate matches without metadata instead of raising KeyError.
+            meta = match.get("metadata") or {}
+            text = meta.get("context", "")
+            docs.append({
+                "content": text,
+                "context": text,
+                "page": meta.get("page"),
+                "score": match.get("score")
+            })
+        return docs