Spaces:

bla
/

tranny

Paused

App Files Files Community

Mbonea committed on Oct 30, 2023

Commit

1fba972

1 Parent(s): f5b43ca

filter conditions

Browse files

Files changed (1) hide show

App/Embedding/utils/Initialize.py +30 -101

App/Embedding/utils/Initialize.py CHANGED Viewed

@@ -1,73 +1,32 @@
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
-from langchain.vectorstores import MongoDBAtlasVectorSearch
-from pymongo import MongoClient
-from motor.motor_asyncio import AsyncIOMotorClient
-import os,pprint
-completion_base = os.environ.get("completion_base")
-openai_api_key = os.environ.get("openai_api_key")
-mongoDB = os.environ.get("MONGO_DB")
-template = """### Given the following context
-### Context
-{context}
-### Use it to explain the question: {question}
- """
-async def fetch_data(question, context):
-    url = completion_base
-    payload = json.dumps(
-        {
-            "messages": [
-                {
-                    "role": "system",
-                    "content": "### You provide explanations based on the provided context",
-                },
-                {
-                    "role": "user",
-                    "content": template.format(context=context, question=question),
-                },
-            ],
-            "model": "gpt-3.5-turbo",
-            "temperature": 1,
-            "presence_penalty": 0,
-            "top_p": 0.95,
-            "frequency_penalty": 0,
-            "stream": False,
-        }
-    )
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {openai_api_key}",
-    }
-    async with aiohttp.ClientSession() as session:
-        async with session.post(url, headers=headers, data=payload) as response:
-            response = await response.json()
-            return response["choices"][0]["message"]["content"]
-async def delete_documents(task_id):
-    client = AsyncIOMotorClient(mongoDB)
-    db = client["transcriptions"]
-    collection = db["videos"]
-    result = await collection.delete_many({"task_id": task_id})
-    print(f"Deleted {result.deleted_count} document(s)")
-# mongo_client = MongoClient(
-#    mongoDB
-# )
-# model_name = "BAAI/bge-base-en"
-# collection = mongo_client["transcriptions"]["videos"]
-# embeddings = HuggingFaceEmbeddings(model_name=model_name)
-# index_name = "test_embeddings"
-# vectorstore = MongoDBAtlasVectorSearch(collection, embeddings, index_name=index_name)
 def generateChunks(chunks, task_id, n=100):
@@ -91,47 +50,17 @@ def generateChunks(chunks, task_id, n=100):
 def search(query: str, task_id: str):
-    mongo_client = MongoClient(mongoDB)
-    model_name = "BAAI/bge-base-en"
-    collection = mongo_client["transcriptions"]["videos"]
-    embeddings = HuggingFaceEmbeddings(model_name=model_name)
-    index_name = "test_embedding"
-    k = 5
-    vectorstore = MongoDBAtlasVectorSearch(
-        collection,
-        embedding=embeddings,
-        index_name="test_embedding",
-    )
-    data = vectorstore.similarity_search(
-        query=query,
-        pre_filter={"text": {"path": "task_id", "query": task_id}},
-        search_kwargs={
-            "k": k,  # overrequest k during search
-            "pre_filter": {"path": "task_id", "equals": task_id},
-            "post_filter_pipeline": [{"$limit": k}],  # limit results to top k
-        },
-    )
-    # data =[d.dict() for d in data]
-    # print(data[0].metadata.exclude({'_id','embedding'}))
-    # pprint.pprint(data[0].metadata)
-    return [{"text": d.page_content,'start':d.metadata['start'],"end":d.metadata['end']} for d in data]
-    # agent =vectorstore.as_retriever(
-    # )
-    # return agent.get_relevant_documents
 def encode(temp: list[Document]):
-    mongo_client = MongoClient(mongoDB)
-    model_name = "BAAI/bge-base-en"
-    collection = mongo_client["transcriptions"]["videos"]
-    embeddings = HuggingFaceEmbeddings(model_name=model_name)
-    index_name = "test_embedding"
-    vectorstore = MongoDBAtlasVectorSearch(
-        collection, embeddings, index_name=index_name
-    )
-    vectorstore.from_documents(
-        temp, embedding=embeddings, collection=collection, index_name=index_name
-    )
-    # return  embeddings.embed_documents(texts = [d.page_content for d in temp])

 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
+from langchain.vectorstores import Pinecone
+import pinecone
+import os
+# get api key from app.pinecone.io
+PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
+# find your environment next to the api key in pinecone console
+PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
+index_name = "transcript-bits"
+model_name = "thenlper/gte-base"
+embeddings = HuggingFaceEmbeddings(model_name=model_name)
+pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
+vector_index = pinecone.Index(index_name=index_name)
+docsearch = Pinecone.from_existing_index(index_name, embeddings)
+async def delete_documents(task_id):
+    """Delete the index entries whose ``task_id`` metadata equals ``task_id``.
+
+    Args:
+        task_id: Value to match against the ``task_id`` metadata field.
+    """
+    docsearch.delete(
+        filter={
+            # Match on the argument's value, not the literal string "task_id".
+            "task_id": {"$eq": task_id},
+        }
+    )
 def generateChunks(chunks, task_id, n=100):
 def search(query: str, task_id: str):
+    """Run a similarity search for ``query`` restricted to one task.
+
+    Args:
+        query: Text to search for.
+        task_id: Only documents whose ``task_id`` metadata equals this value
+            are considered.
+
+    Returns:
+        A list of at most 10 dicts with the keys ``text``, ``start`` and
+        ``end``, taken from each hit's page content and metadata.
+    """
+    # Filter on the argument's value, not the literal string "task_id".
+    filtering_conditions = {
+        "task_id": {"$eq": task_id},
+    }
+    data = docsearch.similarity_search(query, k=10, filter=filtering_conditions)
+    return [
+        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
+        for d in data
+    ]
 def encode(temp: list[Document]):
+    """Add the documents in ``temp`` to the vector store.
+
+    Args:
+        temp: Documents to add.
+    """
+    # ``temp`` is already a list of documents; wrapping it in another list
+    # would pass ``add_documents`` a list where it expects a ``Document``.
+    docsearch.add_documents(temp)
+    # return  embeddings.embed_documents(texts = [d.page_content for d in temp])