tl;dr - Yes, it's possible, but it's really slow (at least this way of doing it is).
Here are the steps to pull this off. In short: store each embedding in an ARRAY column, then compare vectors with a Python UDF called vector_cosine_similarity_vectorized.
If you don't already have a dataset you want to use, this one from Kaggle is good.
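If you do grab the Kaggle set, loading it into a DataFrame looks something like this (a sketch; the file name and column order are assumptions about that particular download, so adjust to whatever you actually saved):

import pandas as pd

# Hypothetical file name for the Kaggle Jeopardy CSV
jep_df = pd.read_csv('JEOPARDY_CSV.csv')
# Rename the headers to match the Snowflake table created below
jep_df.columns = ['show_number', 'air_date', 'round', 'category',
                  'value', 'question', 'answer']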
Create a table in Snowflake:
CREATE TABLE JEOPARDY_EMBEDDINGS (
show_number INT,
air_date STRING,
round STRING,
category STRING,
value STRING,
question STRING,
answer STRING,
embedding ARRAY
);
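One thing worth knowing up front: the Python connector hands ARRAY values back as JSON strings, not Python lists. A throwaway check makes that visible (a sketch, assuming the same connection parameters used later in this post):

import json
import snowflake.connector

conn = snowflake.connector.connect(account=account, user=user, password=password,
                                   database=database, schema=schema,
                                   warehouse=warehouse, role=role)
cur = conn.cursor()
cur.execute("SELECT ARRAY_CONSTRUCT(0.1, 0.2, 0.3)")
print(json.loads(cur.fetchone()[0]))  # -> [0.1, 0.2, 0.3]
cur.close()
conn.close()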
Get embeddings for each row using the OpenAI API (we're only embedding the question column, but you could easily mash the columns together):
import json
import pandas as pd
import openai

openai.api_key = OPENAI_API_KEY

# jep_df is the Jeopardy DataFrame loaded above
rows = jep_df.to_dict(orient='records')
for row in rows:
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=[row['question']],
    )
    row['embedding'] = response["data"][0]["embedding"]
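Hitting the API one row at a time is the slow part of this step. The embeddings endpoint accepts a list of inputs, so a batched variant cuts the round trips dramatically (a sketch; the batch size of 100 is a guess, not a tuned value):

BATCH = 100
for i in range(0, len(rows), BATCH):
    batch = rows[i:i + BATCH]
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=[r['question'] for r in batch],
    )
    # Results come back in the same order as the inputs
    for r, item in zip(batch, response["data"]):
        r['embedding'] = item["embedding"]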
Insert the embeddings into Snowflake:
import snowflake.connector

# Build a DataFrame from the enriched rows and write it out as Parquet
load_to_snow = pd.DataFrame(rows)
parquet_file = 'temp_dataframe.parquet'
load_to_snow.to_parquet(parquet_file, index=False)
conn = snowflake.connector.connect(
account=account,
user=user,
password=password,
database=database,
schema=schema,
warehouse=warehouse,
role=role
)
cur = conn.cursor()
# Create or use an existing internal stage
stage_name = 'TEMP_INTERNAL_STAGE'
cur.execute(f"CREATE OR REPLACE STAGE {stage_name}")
# Upload the Parquet file to the stage
cur.execute(f"PUT file://{parquet_file} @{stage_name}")
# Bulk load the data from the stage to a Snowflake table, matching
# Parquet columns to table columns by name
table_name = 'JEOPARDY_EMBEDDINGS'
cur.execute(
    f"COPY INTO {table_name} FROM @{stage_name} "
    f"FILE_FORMAT=(TYPE='PARQUET') MATCH_BY_COLUMN_NAME=CASE_INSENSITIVE"
)
cur.close()
conn.close()
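Before trusting the load, a quick sanity check is worth it (a sketch; run it before the conn.close() above, or on a fresh connection):

cur = conn.cursor()
# Row count plus the shortest embedding length -- should be 1536 for ada-002
cur.execute("SELECT COUNT(*), MIN(ARRAY_SIZE(embedding)) FROM JEOPARDY_EMBEDDINGS")
print(cur.fetchone())
cur.close()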
Create a Python UDF called vector_cosine_similarity_vectorized:
CREATE OR REPLACE FUNCTION vector_cosine_similarity_vectorized(A ARRAY, B ARRAY)
RETURNS FLOAT
LANGUAGE PYTHON
RUNTIME_VERSION = '3.8'
PACKAGES = ('pandas', 'numpy')
HANDLER = 'vector_cosine_similarity_vectorized'
AS $$
import pandas as pd
import numpy as np
from _snowflake import vectorized

@vectorized(input=pd.DataFrame)
def vector_cosine_similarity_vectorized(df):
    # Extract first and second columns as numpy arrays
    A_series = df.iloc[:, 0].apply(np.array)
    B_series = df.iloc[:, 1].apply(np.array)
    # Compute cosine similarity pairwise, row by row
    dot_product = pd.Series(
        [np.dot(a, b) for a, b in zip(A_series, B_series)],
        index=A_series.index,
    )
    norm_A = A_series.apply(np.linalg.norm)
    norm_B = B_series.apply(np.linalg.norm)
    similarity = dot_product / (norm_A * norm_B)
    return similarity
$$;
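Before pointing the UDF at real embeddings, a quick smoke test on hand-built vectors is cheap insurance (a sketch, assuming an open cursor like the ones used elsewhere; identical vectors should score 1.0, orthogonal ones 0.0):

cur.execute("SELECT vector_cosine_similarity_vectorized(ARRAY_CONSTRUCT(1.0, 0.0), ARRAY_CONSTRUCT(1.0, 0.0))")
print(cur.fetchone()[0])  # -> 1.0
cur.execute("SELECT vector_cosine_similarity_vectorized(ARRAY_CONSTRUCT(1.0, 0.0), ARRAY_CONSTRUCT(0.0, 1.0))")
print(cur.fetchone()[0])  # -> 0.0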
Query the embeddings:
query_sentence = '''what island had a really big monkey?'''
# create an embedding for the query sentence
response = openai.Embedding.create(
model="text-embedding-ada-002",
input=[query_sentence]
)
query_embedding = response["data"][0]["embedding"]
conn = snowflake.connector.connect(
account=account,
user=user,
password=password,
database=database,
schema=schema,
warehouse=warehouse,
role=role
)
cur = conn.cursor()
# Flatten the embedding into a comma-separated literal for ARRAY_CONSTRUCT
search_vec = ','.join(str(x) for x in query_embedding)
sql = f'''
SELECT
j.*,
vector_cosine_similarity_vectorized(ARRAY_CONSTRUCT({search_vec}), j.EMBEDDING) as similarity
FROM
public.JEOPARDY_EMBEDDINGS as j
order by similarity desc
LIMIT 10;
'''
cur.execute(sql)
rows = cur.fetchall()
columns = [col[0] for col in cur.description]
df = pd.DataFrame(rows, columns=columns)
cur.close()
conn.close()
df.head()
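Splicing all 1,536 floats into the SQL text works, but it makes for an enormous statement. A tidier variant (a sketch, assuming the connector's default pyformat parameter style and reusing an open cursor) binds the embedding as a JSON string and casts it to an ARRAY server-side:

import json

sql = '''
SELECT
    j.*,
    vector_cosine_similarity_vectorized(PARSE_JSON(%(vec)s)::ARRAY, j.EMBEDDING) AS similarity
FROM public.JEOPARDY_EMBEDDINGS AS j
ORDER BY similarity DESC
LIMIT 10;
'''
cur.execute(sql, {'vec': json.dumps(query_embedding)})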