8. Examples
PostgreSQL to Pinecone
from vector_etl import create_flow

source = {
    "source_data_type": "database",
    "db_type": "postgres",
    "host": "localhost",
    "database_name": "mydb",
    "username": "user",
    "password": "password",
    "port": 5432,
    "query": "SELECT * FROM mytable WHERE updated_at > :last_updated_at",
    "batch_size": 1000,
    "chunk_size": 1000,
    "chunk_overlap": 0
}

embedding = {
    "embedding_model": "OpenAI",
    "api_key": "your-openai-api-key",
    "model_name": "text-embedding-ada-002"
}

target = {
    "target_database": "Pinecone",
    "pinecone_api_key": "your-pinecone-api-key",
    "index_name": "my-index",
    "dimension": 1536,
    "metric": "cosine",
    "cloud": "aws",
    "region": "us-east-1"
}

embed_columns = [
    "column1",
    "column2",
    "column3"
]

flow = create_flow()
flow.set_source(source)
flow.set_embedding(embedding)
flow.set_target(target)
flow.set_embed_columns(embed_columns)

# Execute the flow
flow.execute()
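The snippet above hard-codes credentials for clarity. In practice you will usually want to read them from the environment instead; the following is a minimal sketch of the same flow, assuming you have exported hypothetical variables such as PG_HOST, PG_PASSWORD, OPENAI_API_KEY and PINECONE_API_KEY (the variable names are illustrative, not part of the library):

import os
from vector_etl import create_flow

# The environment variable names below are assumptions; use whatever your deployment defines.
source = {
    "source_data_type": "database",
    "db_type": "postgres",
    "host": os.environ["PG_HOST"],
    "database_name": os.environ["PG_DATABASE"],
    "username": os.environ["PG_USER"],
    "password": os.environ["PG_PASSWORD"],
    "port": int(os.environ.get("PG_PORT", "5432")),
    "query": "SELECT * FROM mytable WHERE updated_at > :last_updated_at"
}

embedding = {
    "embedding_model": "OpenAI",
    "api_key": os.environ["OPENAI_API_KEY"],
    "model_name": "text-embedding-ada-002"
}

target = {
    "target_database": "Pinecone",
    "pinecone_api_key": os.environ["PINECONE_API_KEY"],
    "index_name": "my-index"
}

flow = create_flow()
flow.set_source(source)
flow.set_embedding(embedding)
flow.set_target(target)
flow.set_embed_columns(["column1", "column2", "column3"])
flow.execute()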
Dropbox to Weaviate (using Google Gemini embedding)
from vector_etl import create_flow

source = {
    "source_data_type": "Dropbox",
    "key": "",
    "folder_path": "/root/ContextData/",
    "file_type": "csv",
    "chunk_size": 1000,
    "chunk_overlap": 0
}

embedding = {
    "embedding_model": "Google Gemini",
    "api_key": "my-gemini-api-key",
    "model_name": "embedding-001"
}

target = {
    "target_database": "Weaviate",
    "weaviate_url": "my-clustername.region.gcp.weaviate.cloud",
    "weaviate_api_key": "my-weaviate-api-key",
    "class_name": "my-weaviate-class-name"
}

embed_columns = []  # Empty array: file-based sources do not require embedding columns

flow = create_flow()
flow.set_source(source)
flow.set_embedding(embedding)
flow.set_target(target)
flow.set_embed_columns(embed_columns)

# Execute the flow
flow.execute()
Importing a Python configuration into an existing application workflow
You can also set up the configuration in one Python file, as shown below:
# my_user_script.py
from vector_etl import create_flow

def run_etl_job():
    # Define configurations
    source = {
        "source_data_type": "Local File",
        "file_path": "/path/to/your/data/",
        "file_type": "csv",
        "chunk_size": 1000,
        "chunk_overlap": 0
    }

    embedding = {
        "embedding_model": "OpenAI",
        "api_key": "your-openai-api-key",
        "model_name": "text-embedding-ada-002"
    }

    target = {
        "target_database": "Pinecone",
        "pinecone_api_key": "your-pinecone-api-key",
        "index_name": "my-index",
    }

    # Create and configure the flow
    flow = create_flow()
    flow.set_source(source)
    flow.set_embedding(embedding)
    flow.set_target(target)

    # Execute the flow
    flow.execute()

if __name__ == "__main__":
    run_etl_job()
and then import it into another Python file, as shown below:
# my_current_app.py
import logging

from my_user_script import run_etl_job

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def run_etl():
    logger.info("Starting ETL job...")
    try:
        run_etl_job()
        logger.info("ETL job completed successfully.")
    except Exception as e:
        logger.error(f"An error occurred during the ETL job: {str(e)}")
    logger.info("ETL job process ended.")

def rag_query(query):
    # Logic for your RAG query here...
    pass

run_etl()

my_query = "your-query-here"
results = rag_query(my_query)
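The rag_query function above is deliberately left as a stub. As an illustration only, one possible shape for it is sketched below, assuming the OpenAI and Pinecone Python SDKs and the same model and index used by the flow; the exact client calls depend on the SDK versions you have installed:

from openai import OpenAI
from pinecone import Pinecone

def rag_query(query):
    # Embed the query with the same model used during the ETL run.
    openai_client = OpenAI(api_key="your-openai-api-key")
    query_vector = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query
    ).data[0].embedding

    # Search the Pinecone index that the flow populated.
    pc = Pinecone(api_key="your-pinecone-api-key")
    index = pc.Index("my-index")
    return index.query(vector=query_vector, top_k=5, include_metadata=True)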
YAML configuration file examples
PostgreSQL to Pinecone
source:
  source_data_type: "database"
  db_type: "postgres"
  host: "localhost"
  database_name: "mydb"
  username: "user"
  password: "password"
  port: 5432
  query: "SELECT * FROM mytable WHERE updated_at > :last_updated_at"
  batch_size: 1000 # [Optional] Default is 1000
  chunk_size: 1000 # [Optional] Default is 1000
  chunk_overlap: 0 # [Optional] Default is 0

embedding:
  embedding_model: "OpenAI"
  api_key: "your-openai-api-key"
  model_name: "text-embedding-ada-002"

target:
  target_database: "Pinecone"
  pinecone_api_key: "your-pinecone-api-key"
  index_name: "my-index"
  dimension: 1536 # [Optional] Only required if creating a new index
  metric: "cosine" # [Optional] Only required if creating a new index
  cloud: "aws" # [Optional] Only required if creating a new index
  region: "us-east-1" # [Optional] Only required if creating a new index

embed_columns:
  - "column1"
  - "column2"
  - "column3"
Box to Pinecone
source:
  source_data_type: "Box"
  folder_path: "MyFolder"
  file_type: "pdf" # Required if folder_path is a directory; all files of this type will be retrieved
  access_token: "your-developer-token" # Developer token from the Box App console
  chunk_size: 1000
  chunk_overlap: 200

embedding:
  embedding_model: "OpenAI"
  api_key: "your-openai-api-key"
  model_name: "text-embedding-ada-002"

target:
  target_database: "Pinecone"
  pinecone_api_key: "your-pinecone-api-key"
  index_name: "my-index"
  dimension: 1536 # [Optional] Only required if creating a new index
  metric: "cosine" # [Optional] Only required if creating a new index
  cloud: "aws" # [Optional] Only required if creating a new index
  region: "us-east-1" # [Optional] Pinecone will default to us-east-1

embed_columns: [] # Empty array: file-based sources do not require embedding columns
Dropbox to Weaviate (using Google Gemini embedding)
source:
  source_data_type: "Dropbox"
  key: ''
  folder_path: "/root/ContextData/"
  file_type: "csv"
  chunk_size: 1000
  chunk_overlap: 0

embedding:
  embedding_model: "Google Gemini"
  api_key: "my-gemini-api-key"
  model_name: "embedding-001"

target:
  target_database: "Weaviate"
  weaviate_url: "my-clustername.region.gcp.weaviate.cloud"
  weaviate_api_key: "my-weaviate-api-key"
  class_name: "my-weaviate-class-name"

embed_columns: [] # Empty array: file-based sources do not require embedding columns
Local file to Pinecone
source:
  source_data_type: "Local File"
  file_path: "/root/ContextData/"
  file_type: "csv"
  chunk_size: 1000
  chunk_overlap: 0

embedding:
  embedding_model: "OpenAI"
  api_key: "your-openai-api-key"
  model_name: "text-embedding-ada-002"

target:
  target_database: "Pinecone"
  pinecone_api_key: "your-pinecone-api-key"
  index_name: "my-index"
  dimension: 1536 # [Optional] Only required if creating a new index
  metric: "cosine" # [Optional] Only required if creating a new index
  cloud: "aws" # [Optional] Only required if creating a new index
  region: "us-east-1" # [Optional] Only required if creating a new index

embed_columns: [] # Empty array: file-based sources do not require embedding columns
Google Cloud Storage (GCS) to Qdrant (using Cohere embedding)
source:
  source_data_type: "Google Cloud Storage"
  credentials_path: "/path/to/your/credentials.json"
  bucket_name: "myBucket"
  prefix: "prefix/"
  file_type: "csv" # Required if prefix is a directory; all files of this type will be retrieved
  chunk_size: 1000 # [Optional] Default is 1000
  chunk_overlap: 0 # [Optional] Default is 0

embedding:
  embedding_model: "Cohere"
  api_key: "my-cohere-key"
  model_name: "embed-english-v3.0"

target:
  target_database: "Qdrant"
  qdrant_url: "https://your-qdrant-cluster-url.qdrant.io"
  qdrant_api_key: "your-qdrant-api-key"
  collection_name: "my-collection"

embed_columns: [] # Empty array: file-based sources do not require embedding columns
Amazon S3 to Pinecone
source:
  source_data_type: "Amazon S3"
  bucket_name: "myBucket"
  prefix: "Dir/Subdir/"
  file_type: "csv" # Required if prefix is a directory; all files of this type will be retrieved
  aws_access_key_id: "your-access-key"
  aws_secret_access_key: "your-secret-access-key"

embedding:
  embedding_model: "OpenAI"
  api_key: "your-openai-api-key"
  model_name: "text-embedding-ada-002"

target:
  target_database: "Pinecone"
  pinecone_api_key: "your-pinecone-api-key"
  index_name: "my-index"
  dimension: 1536 # [Optional] Only required if creating a new index
  metric: "cosine" # [Optional] Only required if creating a new index
  cloud: "aws" # [Optional] Only required if creating a new index
  region: "us-east-1" # [Optional] Only required if creating a new index

embed_columns: [] # Empty array: file-based sources do not require embedding columns