## 8. Examples

### PostgreSQL to Pinecone

```python
from vector_etl import create_flow

source = {
    "source_data_type": "database",
    "db_type": "postgres",
    "host": "localhost",
    "database_name": "mydb",
    "username": "user",
    "password": "password",
    "port": 5432,
    "query": "SELECT * FROM mytable WHERE updated_at > :last_updated_at",
    "batch_size": 1000,
    "chunk_size": 1000,
    "chunk_overlap": 0
}

embedding = {
    "embedding_model": "OpenAI",
    "api_key": "your-openai-api-key",
    "model_name": "text-embedding-ada-002"
}

target = {
    "target_database": "Pinecone",
    "pinecone_api_key": "your-pinecone-api-key",
    "index_name": "my-index",
    "dimension": 1536,
    "metric": "cosine",
    "cloud": "aws",
    "region": "us-east-1"
}

embed_columns = ["column1", "column2", "column3"]

flow = create_flow()
flow.set_source(source)
flow.set_embedding(embedding)
flow.set_target(target)
flow.set_embed_columns(embed_columns)

# Execute the flow
flow.execute()
```

### Dropbox to Weaviate (using Google Gemini embedding)

```python
from vector_etl import create_flow

source = {
    "source_data_type": "Dropbox",
    "key": "",
    "folder_path": "/root/ContextData/",
    "file_type": "csv",
    "chunk_size": 1000,
    "chunk_overlap": 0
}

embedding = {
    "embedding_model": "Google Gemini",
    "api_key": "my-gemini-api-key",
    "model_name": "embedding-001"
}

target = {
    "target_database": "Weaviate",
    "weaviate_url": "my-clustername.region.gcp.weaviate.cloud",
    "weaviate_api_key": "my-weaviate-api-key",
    "class_name": "my-weaviate-class-name"
}

embed_columns = []  # Empty list: file-based sources do not require embedding columns

flow = create_flow()
flow.set_source(source)
flow.set_embedding(embedding)
flow.set_target(target)
flow.set_embed_columns(embed_columns)

# Execute the flow
flow.execute()
```

### Importing a Python configuration into an existing application workflow

You can also set up the configuration in a single Python file:

```python
# my_user_script.py
from vector_etl import create_flow

def run_etl_job():
    # Define configurations
    source = {
        "source_data_type": "Local File",
        "file_path": "/path/to/your/data/",
        "file_type": "csv",
        "chunk_size": 1000,
        "chunk_overlap": 0
    }

    embedding = {
        "embedding_model": "OpenAI",
        "api_key": "your-openai-api-key",
        "model_name": "text-embedding-ada-002"
    }

    target = {
        "target_database": "Pinecone",
        "pinecone_api_key": "your-pinecone-api-key",
        "index_name": "my-index",
    }

    # Create and configure the flow
    flow = create_flow()
    flow.set_source(source)
    flow.set_embedding(embedding)
    flow.set_target(target)

    # Execute the flow
    flow.execute()

if __name__ == "__main__":
    run_etl_job()
```

and then import it into another Python file:

```python
# my_current_app.py
import logging

from my_user_script import run_etl_job

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def run_etl():
    logger.info("Starting ETL job...")
    try:
        run_etl_job()
        logger.info("ETL job completed successfully.")
    except Exception as e:
        logger.error(f"An error occurred during the ETL job: {str(e)}")
    logger.info("ETL job process ended.")

def rag_query(query):
    # Logic for the RAG query goes here...
    ...

run_etl()
answer = rag_query("my question")
```

### YAML configuration file examples

#### PostgreSQL to Pinecone

```yaml
source:
  source_data_type: "database"
  db_type: "postgres"
  host: "localhost"
  database_name: "mydb"
  username: "user"
  password: "password"
  port: 5432
  query: "SELECT * FROM mytable WHERE updated_at > :last_updated_at"
  batch_size: 1000 #[Optional] Default is 1000
  chunk_size: 1000 #[Optional] Default is 1000
  chunk_overlap: 0 #[Optional] Default is 0

embedding:
  embedding_model: "OpenAI"
  api_key: "your-openai-api-key"
  model_name: "text-embedding-ada-002"

target:
  target_database: "Pinecone"
  pinecone_api_key: "your-pinecone-api-key"
  index_name: "my-index"
  dimension: 1536 #[Optional] Only required if creating a new index
  metric: "cosine" #[Optional] Only required if creating a new index
  cloud: "aws" #[Optional] Only required if creating a new index
  region: "us-east-1" #[Optional] Only required if creating a new index

embed_columns:
  - "column1"
  - "column2"
  - "column3"
```
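The YAML files in this section use the same `source`, `embedding`, `target`, and `embed_columns` sections as the Python dictionaries shown earlier, so one way to run a YAML file from Python is to load it yourself and hand each section to `create_flow()`. The sketch below is only an illustration, not part of the vector_etl API: it assumes PyYAML is installed, the configuration above is saved as `config.yaml`, and the helper name `run_from_yaml` is made up for this example.

```python
# run_from_yaml.py -- illustrative helper, not part of the vector_etl API
import yaml  # PyYAML

from vector_etl import create_flow

def run_from_yaml(path="config.yaml"):
    # Load the same sections shown in the YAML examples in this section
    with open(path) as f:
        config = yaml.safe_load(f)

    flow = create_flow()
    flow.set_source(config["source"])
    flow.set_embedding(config["embedding"])
    flow.set_target(config["target"])
    flow.set_embed_columns(config.get("embed_columns", []))

    # Execute the flow
    flow.execute()

if __name__ == "__main__":
    run_from_yaml()
```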
#### Box to Pinecone

```yaml
source:
  source_data_type: "Box"
  folder_path: "MyFolder"
  file_type: "pdf" #Required if folder_path is a directory: all files of this type will be retrieved
  access_token: "your-developer-token" #Developer token from the Box app console
  chunk_size: 1000
  chunk_overlap: 200

embedding:
  embedding_model: "OpenAI"
  api_key: "your-openai-api-key"
  model_name: "text-embedding-ada-002"

target:
  target_database: "Pinecone"
  pinecone_api_key: "your-pinecone-api-key"
  index_name: "my-index"
  dimension: 1536 #[Optional] Only required if creating a new index
  metric: "cosine" #[Optional] Only required if creating a new index
  cloud: "aws" #[Optional] Only required if creating a new index
  region: "us-east-1" #[Optional] Pinecone will default to us-east-1

embed_columns: [] #Empty array: file-based sources do not require embedding columns
```

#### Dropbox to Weaviate (using Google Gemini embedding)

```yaml
source:
  source_data_type: "Dropbox"
  key: ''
  folder_path: "/root/ContextData/"
  file_type: "csv"
  chunk_size: 1000
  chunk_overlap: 0

embedding:
  embedding_model: "Google Gemini"
  api_key: "my-gemini-api-key"
  model_name: "embedding-001"

target:
  target_database: "Weaviate"
  weaviate_url: "my-clustername.region.gcp.weaviate.cloud"
  weaviate_api_key: "my-weaviate-api-key"
  class_name: "my-weaviate-class-name"

embed_columns: [] #Empty array: file-based sources do not require embedding columns
```

#### Local File to Pinecone

```yaml
source:
  source_data_type: "Local File"
  file_path: "/root/ContextData/"
  file_type: "csv"
  chunk_size: 1000
  chunk_overlap: 0

embedding:
  embedding_model: "OpenAI"
  api_key: "your-openai-api-key"
  model_name: "text-embedding-ada-002"

target:
  target_database: "Pinecone"
  pinecone_api_key: "your-pinecone-api-key"
  index_name: "my-index"
  dimension: 1536 #[Optional] Only required if creating a new index
  metric: "cosine" #[Optional] Only required if creating a new index
  cloud: "aws" #[Optional] Only required if creating a new index
  region: "us-east-1" #[Optional] Only required if creating a new index

embed_columns: [] #Empty array: file-based sources do not require embedding columns
```

#### Google Cloud Storage (GCS) to Qdrant (using Cohere)

```yaml
source:
  source_data_type: "Google Cloud Storage"
  credentials_path: "/path/to/your/credentials.json"
  bucket_name: "myBucket"
  prefix: "prefix/"
  file_type: "csv" #Required if prefix is a directory: all files of this type will be retrieved
  chunk_size: 1000 #[Optional] Default is 1000
  chunk_overlap: 0 #[Optional] Default is 0

embedding:
  embedding_model: "Cohere"
  api_key: "my-cohere-key"
  model_name: "embed-english-v3.0"

target:
  target_database: "Qdrant"
  qdrant_url: "https://your-qdrant-cluster-url.qdrant.io"
  qdrant_api_key: "your-qdrant-api-key"
  collection_name: "my-collection"

embed_columns: [] #Empty array: file-based sources do not require embedding columns
```
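The examples in this section show API keys and passwords inline to keep them short. In practice you will usually want to keep secrets out of configuration files; one option with the Python API is to read them from environment variables when building the dictionaries. A brief sketch (the variable names `OPENAI_API_KEY` and `PINECONE_API_KEY` are only illustrative; export whatever names fit your environment):

```python
import os

from vector_etl import create_flow

source = {
    "source_data_type": "Local File",
    "file_path": "/path/to/your/data/",
    "file_type": "csv"
}

# Hypothetical environment variable names; adjust to your own setup
embedding = {
    "embedding_model": "OpenAI",
    "api_key": os.environ["OPENAI_API_KEY"],
    "model_name": "text-embedding-ada-002"
}

target = {
    "target_database": "Pinecone",
    "pinecone_api_key": os.environ["PINECONE_API_KEY"],
    "index_name": "my-index"
}

flow = create_flow()
flow.set_source(source)
flow.set_embedding(embedding)
flow.set_target(target)
flow.set_embed_columns([])  # File-based source: no embedding columns needed
flow.execute()
```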
#### Amazon S3 to Pinecone

```yaml
source:
  source_data_type: "Amazon S3"
  bucket_name: "myBucket"
  prefix: "Dir/Subdir/"
  file_type: "csv" #Required if prefix is a directory: all files of this type will be retrieved
  aws_access_key_id: "your-access-key"
  aws_secret_access_key: "your-secret-access-key"

embedding:
  embedding_model: "OpenAI"
  api_key: "your-openai-api-key"
  model_name: "text-embedding-ada-002"

target:
  target_database: "Pinecone"
  pinecone_api_key: "your-pinecone-api-key"
  index_name: "my-index"
  dimension: 1536 #[Optional] Only required if creating a new index
  metric: "cosine" #[Optional] Only required if creating a new index
  cloud: "aws" #[Optional] Only required if creating a new index
  region: "us-east-1" #[Optional] Only required if creating a new index

embed_columns: [] #Empty array: file-based sources do not require embedding columns
```
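After a flow finishes, you may want to confirm that vectors actually landed in the target store. For the Pinecone examples above, one way to spot-check is to embed a test question with the same model and run a similarity search against the index. A rough sketch, assuming the current `openai` and `pinecone` Python client packages are installed; this talks to those services directly and is not part of vector_etl, and the question text is just a placeholder:

```python
from openai import OpenAI
from pinecone import Pinecone

openai_client = OpenAI(api_key="your-openai-api-key")
pc = Pinecone(api_key="your-pinecone-api-key")
index = pc.Index("my-index")

# Embed a test question with the same model used during the load
question = "What does this dataset say about my topic?"
vector = openai_client.embeddings.create(
    model="text-embedding-ada-002",
    input=question
).data[0].embedding

# Retrieve the closest matches and inspect their metadata
results = index.query(vector=vector, top_k=5, include_metadata=True)
for match in results.matches:
    print(match.id, match.score, match.metadata)
```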