In recent years, vector embeddings have become the foundation of modern Natural Language Processing (NLP) and semantic search. Rather than searching by keyword, vector databases allow us to compare the “meanings” of texts based on their numerical representations (embeddings). This example demonstrates how to leverage OpenAI Embeddings, Go, and PostgreSQL with the pgvector extension to create a semantic search engine.
What Are Embeddings?
An embedding is a numerical representation of text (or other data) as a vector in a high-dimensional space. If two pieces of text are semantically similar, their vectors will be close in that space. By storing embeddings in a database like PostgreSQL (with the pgvector extension), we can perform similarity searches quickly and accurately.
Why PostgreSQL with pgvector?
pgvector is a popular extension that adds vector data types to PostgreSQL. It enables you to:
- Store embeddings as vector columns
- Perform approximate or exact nearest neighbor searches
- Use standard SQL to run queries
Overview of Our Application
- Call OpenAI’s Embedding API to convert input text into vector embeddings.
- Store those embeddings in PostgreSQL using the pgvector extension.
- Query embeddings to find the most semantically similar entries in our database.
Prerequisites
- Go installed (1.19+ recommended).
- PostgreSQL installed and running locally or hosted.
- pgvector extension installed in PostgreSQL. (See pgvector’s GitHub page for installation instructions.)
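- An OpenAI API key with access to embeddings. The program reads it from the `OPENAI_API_KEY` environment variable and reads the database connection string from `DATABASE_URL`; both are loaded from a `.env` file in the working directory (for the Docker setup below, for example, `DATABASE_URL=postgres://admin:admin@localhost:5432/vectordb`).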
The following Makefile defines two tasks for local testing: one starts PostgreSQL with pgvector in a Docker container, and the other opens a psql session against it.
# pgvector: start a detached Docker container named "pgvector" from the
# pgvector/pgvector:pg17 image, with user/password "admin", database
# "vectordb", a named volume mounted at the PostgreSQL data directory,
# and port 5432 published on the host.
pgvector:
@docker run -d \
--name pgvector \
-e POSTGRES_USER=admin \
-e POSTGRES_PASSWORD=admin \
-e POSTGRES_DB=vectordb \
-v pgvector_data:/var/lib/postgresql/data \
-p 5432:5432 \
pgvector/pgvector:pg17
# psql: open an interactive psql session to the "vectordb" database on
# localhost as user "admin".
psql:
@psql -h localhost -U admin -d vectordb
Make sure pgvector is installed, then enable the extension by running the following statement in your PostgreSQL database (the Go program below also runs it on startup):
-- Enable the pgvector extension in the current database; does nothing if it is already enabled.
CREATE EXTENSION IF NOT EXISTS vector;
The full code
package main
import (
	"context"
	"fmt"
	"log"
	"os"
	"strconv"
	"strings"

	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/joho/godotenv"
	"github.com/sashabaranov/go-openai"
)
// floats32ToString renders a float32 slice in the bracketed text format that
// pgvector accepts for vector input, e.g. [0.1, 0.2, 0.3].
//
// Each component is written using the shortest decimal representation that
// parses back to the same float32. Formatting with "%f" would truncate every
// component to six decimal places, silently turning small components into
// 0.000000 and degrading the stored embedding. The 'f' format is used so no
// exponent notation is emitted.
//
// A nil or empty slice yields "[]".
func floats32ToString(floats []float32) string {
	strVals := make([]string, len(floats))
	for i, val := range floats {
		// bitSize 32 makes FormatFloat round as a float32, so 0.1 prints as
		// "0.1" rather than as the float64 expansion of the float32 value.
		strVals[i] = strconv.FormatFloat(float64(val), 'f', -1, 32)
	}
	// Components are separated with comma + space.
	return "[" + strings.Join(strVals, ", ") + "]"
}
// main loads environment variables from a .env file and runs the demo,
// exiting with a non-zero status if any required step fails.
func main() {
	// Load environment variables
	if err := godotenv.Load(); err != nil {
		log.Fatalf("Error loading .env file: %v", err)
	}
	// All remaining work lives in run so that its deferred cleanup executes
	// before the process exits; log.Fatal calls os.Exit, which skips defers.
	if err := run(context.Background()); err != nil {
		log.Fatal(err)
	}
}

// run executes the demo end to end: it prepares the schema, embeds and stores
// the sample documents, builds the vector index, and prints the documents most
// similar to a sample query.
//
// It reads DATABASE_URL and OPENAI_API_KEY from the environment. A failure to
// insert an individual sample document is logged and skipped; any other
// failure is returned to the caller.
func run(ctx context.Context) error {
	// Create the connection pool
	dbpool, err := pgxpool.New(ctx, os.Getenv("DATABASE_URL"))
	if err != nil {
		return fmt.Errorf("unable to create connection pool: %w", err)
	}
	defer dbpool.Close()

	// 1. Ensure pgvector extension is enabled
	if _, err = dbpool.Exec(ctx, "CREATE EXTENSION IF NOT EXISTS vector;"); err != nil {
		return fmt.Errorf("failed to create extension: %w", err)
	}

	// 2. Create table (if not existing)
	createTableSQL := `
CREATE TABLE IF NOT EXISTS documents (
id SERIAL PRIMARY KEY,
content TEXT,
embedding vector(1536)
);
`
	if _, err = dbpool.Exec(ctx, createTableSQL); err != nil {
		return fmt.Errorf("failed to create table: %w", err)
	}

	// 3. Initialize OpenAI client
	apiKey := os.Getenv("OPENAI_API_KEY")
	if apiKey == "" {
		return fmt.Errorf("OPENAI_API_KEY is not set")
	}
	openaiClient := openai.NewClient(apiKey)

	// 4. Insert sample documents
	docs := []string{
		"PostgreSQL is an advanced open-source relational database.",
		"OpenAI provides GPT-based models to generate text embeddings.",
		"pgvector allows storing embeddings in a Postgres database.",
	}
	for _, doc := range docs {
		// Best effort: one failed document should not abort the whole demo.
		if err := insertDocument(ctx, dbpool, openaiClient, doc); err != nil {
			log.Printf("Failed to insert document '%s': %v", doc, err)
		}
	}

	// 5. Create index (if not existing)
	// The index is created after the inserts because pgvector recommends
	// building an IVFFlat index once the table already has data.
	// NOTE(review): lists = 100 is far more than three sample rows need;
	// tune it to the real row count.
	createIndexSQL := `
CREATE INDEX IF NOT EXISTS documents_embedding_idx
ON documents USING ivfflat (embedding vector_l2_ops) WITH (lists = 100);
`
	if _, err = dbpool.Exec(ctx, createIndexSQL); err != nil {
		return fmt.Errorf("failed to create index: %w", err)
	}

	// 6. Query for similarity
	queryText := "How to store embeddings in Postgres?"
	similarDocs, err := searchSimilarDocuments(ctx, dbpool, openaiClient, queryText, 5)
	if err != nil {
		return fmt.Errorf("search failed: %w", err)
	}

	fmt.Println("=== Most Similar Documents ===")
	for _, doc := range similarDocs {
		fmt.Printf("- %s\n", doc)
	}
	return nil
}
// insertDocument generates an embedding for content using the OpenAI API
// and inserts the content together with its embedding into the documents
// table.
//
// It returns an error if the embeddings request fails, if the response
// contains no embedding, or if the INSERT statement fails.
func insertDocument(ctx context.Context, dbpool *pgxpool.Pool, client *openai.Client, content string) error {
	// 1) Get embedding from OpenAI
	embedResp, err := client.CreateEmbeddings(ctx, openai.EmbeddingRequest{
		Model: openai.AdaEmbeddingV2, // "text-embedding-ada-002"
		Input: []string{content},
	})
	if err != nil {
		return fmt.Errorf("CreateEmbeddings API call failed: %w", err)
	}
	// Guard the index below: an empty Data slice would otherwise panic.
	if len(embedResp.Data) == 0 {
		return fmt.Errorf("CreateEmbeddings API call returned no embedding")
	}

	// 2) Convert embedding to bracketed string for pgvector
	embedding := embedResp.Data[0].Embedding // []float32
	embeddingStr := floats32ToString(embedding)

	// 3) Insert into PostgreSQL; the text value is cast to vector server-side.
	insertSQL := `
INSERT INTO documents (content, embedding)
VALUES ($1, $2::vector)
`
	if _, err = dbpool.Exec(ctx, insertSQL, content, embeddingStr); err != nil {
		return fmt.Errorf("failed to insert document: %w", err)
	}
	return nil
}
// searchSimilarDocuments embeds query with the OpenAI API and returns the
// content of up to k rows from the documents table, ordered by ascending
// `<->` distance between each stored embedding and the query embedding.
//
// It returns an error if the embeddings request fails, if the response
// contains no embedding, or if the query or row scanning fails.
func searchSimilarDocuments(ctx context.Context, pool *pgxpool.Pool, client *openai.Client, query string, k int) ([]string, error) {
	// 1) Get the embedding for the user's query via OpenAI
	embedResp, err := client.CreateEmbeddings(ctx, openai.EmbeddingRequest{
		Model: openai.AdaEmbeddingV2, // "text-embedding-ada-002"
		Input: []string{query},
	})
	if err != nil {
		return nil, fmt.Errorf("CreateEmbeddings API call failed: %w", err)
	}
	// Guard the index below: an empty Data slice would otherwise panic.
	if len(embedResp.Data) == 0 {
		return nil, fmt.Errorf("CreateEmbeddings API call returned no embedding")
	}

	// 2) Convert the OpenAI embedding to the bracketed string format for pgvector
	queryEmbeddingStr := floats32ToString(embedResp.Data[0].Embedding)

	// 3) Order by vector distance. Values are passed as bind parameters
	// rather than formatted into the SQL text, matching insertDocument and
	// keeping the statement text constant across calls.
	const selectSQL = `
SELECT content
FROM documents
ORDER BY embedding <-> $1::vector
LIMIT $2
`

	// 4) Run the query
	rows, err := pool.Query(ctx, selectSQL, queryEmbeddingStr, k)
	if err != nil {
		return nil, fmt.Errorf("failed to query documents: %w", err)
	}
	defer rows.Close()

	// 5) Read the matching documents
	var contents []string
	for rows.Next() {
		var content string
		if err := rows.Scan(&content); err != nil {
			return nil, fmt.Errorf("failed to scan row: %w", err)
		}
		contents = append(contents, content)
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("row iteration error: %w", err)
	}
	return contents, nil
}
Conclusion
OpenAI embeddings, Go, and pgvector in PostgreSQL offer a straightforward solution for building semantic search applications. By representing text as vectors and leveraging the power of database indexing, we move from traditional keyword-based search to searching by context and meaning.
Top comments (0)