Slide 1

Slide 1 text

Semantic search with Django, PostgreSQL, & pgvector www.paulox.net PAOLO MELCHIORRE

Slide 2

Slide 2 text

No content

Slide 3

Slide 3 text

● PSF, DSF, Python Italia member ● PyCon Italia co-organizer ● Python Pescara organizer ● Django contributor ● Django Girls coach Paolo Melchiorre paulox.net © 2022 Bartek Pawlik (CC BY-NC-SA)

Slide 4

Slide 4 text

4 © 1946 William Gottlieb (Public Domain)

Slide 5

Slide 5 text

5 Django “The web framework for perfectionists with deadlines.” Project start in Lawrence Journal-World (2003) Public release and WSGI (PEP 333) support (2005) Python 3 support (2013) PostgreSQL module (2015) ASGI support (2019) Psycopg 3 support (2022)

Slide 6

Slide 6 text

No content

Slide 7

Slide 7 text

7 """Flask quickstart.""" from flask import Flask app = Flask(__name__) @app.route("/") def hello_world(): return "<p>Hello, World!</p>" # https://flask.palletsprojects.com/en/latest/quickstart/

Slide 8

Slide 8 text

No content

Slide 9

Slide 9 text

9 """FastAPI first step.""" from fastapi import FastAPI app = FastAPI() @app.get("/") async def root(): return {"message": "Hello World"} # https://fastapi.tiangolo.com/tutorial/first-steps/

Slide 10

Slide 10 text

🧬 μDjango

Slide 11

Slide 11 text

"""μDjango WSGI example.""" from django import conf, http, urls from django.core.handlers import wsgi conf.settings.configure(ROOT_URLCONF=__name__) app = wsgi.WSGIHandler() urlpatterns = [urls.path("", lambda r: http.HttpResponse("🚀"))] # https://github.com/pauloxnet/uDjango 11

Slide 12

Slide 12 text

"""μDjango ASGI example.""" from django import conf, http, urls from django.core.handlers import asgi conf.settings.configure(ROOT_URLCONF=__name__) app = asgi.ASGIHandler() async def root(request): return http.JsonResponse({"message": "Hello World"}) urlpatterns = [urls.path("", root)] # https://github.com/pauloxnet/uDjango 12

Slide 13

Slide 13 text

No content

Slide 14

Slide 14 text

$ # ------------ $ # Requirements $ # ------------ $ $ python3 --version Python 3.12.3 $ python3.12 -m venv ~/.venv $ . ~/.venv/bin/activate $ python -m pip install django ... Successfully installed asgiref-3.8.1 django-5.0.6 sqlparse-0.5… 14

Slide 15

Slide 15 text

$ # ------------- $ # Start project $ # ------------- $ $ cd ~/projects $ python -m django startproject semanticsearch $ tree --noreport semanticsearch/ semanticsearch/ ├── manage.py └── semanticsearch ├── asgi.py ├── __init__.py ├── settings.py ├── urls.py └── wsgi.py 15

Slide 16

Slide 16 text

16 $ # --------- $ # Start app $ # --------- $ $ cd semanticsearch/ $ python -m django startapp items $ tree --noreport items/ items/ ├── admin.py ├── apps.py ├── __init__.py ├── migrations │ └── __init__.py ├── models.py ├── tests.py └── views.py

Slide 17

Slide 17 text

17 # semanticsearch/semanticsearch/settings.py INSTALLED_APPS = [ "django.contrib.admin", "django.contrib.auth", "django.contrib.contenttypes", "django.contrib.sessions", "django.contrib.messages", "django.contrib.staticfiles", "items", ]

Slide 18

Slide 18 text

18 # semanticsearch/items/models.py from django.db import models class Item(models.Model): content = models.TextField() price = models.IntegerField(db_default=10) in_stock = models.BooleanField(db_default=True) def __str__(self): return self.content

Slide 19

Slide 19 text

19 $ # --------- $ # Migration $ # --------- $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0001_initial.py - Create model Item $ python -m manage migrate Operations to perform: Target specific migration: 0001_initial, from items Running migrations: Applying items.0001_initial... OK $ python -m manage sqlmigrate items 0001

Slide 20

Slide 20 text

20 BEGIN; -- -- Create model Item -- CREATE TABLE "items_item" ( "id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "content" text NOT NULL, "price" integer DEFAULT 10 NOT NULL, "in_stock" bool DEFAULT 1 NOT NULL ); COMMIT;

Slide 21

Slide 21 text

21 $ python -m manage shell >>> from items.models import Item >>> Item.objects.filter( ... content__icontains="rock", price=10, in_stock=True ... ) <QuerySet [<Item: …>, <Item: …>]>

Slide 22

Slide 22 text

22 -- -- Django ORM generated SQL from SQLite -- SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock" FROM "items_item" WHERE ( "items_item"."content" LIKE '%rock%' ESCAPE '\' AND "items_item"."in_stock" AND "items_item"."price" = 10 );

Slide 23

Slide 23 text

23 # semanticsearch/items/admin.py from django.contrib import admin from items.models import Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): list_display = ["content", "price", "in_stock"] list_filter = ["price", "in_stock"] search_fields = ["content"] show_facets = admin.ShowFacets.ALWAYS

Slide 24

Slide 24 text

$ # ---------- $ # Run server $ # ---------- $ $ python -m manage createsuperuser $ python -m manage runserver Watching for file changes with StatReloader Performing system checks... System check identified no issues (0 silenced). Django version 5.0.6, using settings 'semanticsearch.settings' Starting development server at http://127.0.0.1:8000/ Quit the server with CONTROL-C. 24

Slide 25

Slide 25 text

No content

Slide 26

Slide 26 text

No content

Slide 27

Slide 27 text

No content

Slide 28

Slide 28 text

No content

Slide 29

Slide 29 text

No content

Slide 30

Slide 30 text

$ # ---------- $ # Psycopg v3 $ # ---------- $ $ python -m pip install psycopg[binary] ... Successfully installed psycopg-3.1.19 psycopg-binary-3.1.19 t… 30

Slide 31

Slide 31 text

31 # semanticsearch/semanticsearch/settings.py DATABASES = { "default": { "ENGINE": "django.db.backends.postgresql", "HOST": "", "NAME": "", "PASSWORD": "", "PORT": "", "USER": "", } }

Slide 32

Slide 32 text

32 # semanticsearch/semanticsearch/settings.py INSTALLED_APPS = [ "django.contrib.admin", "django.contrib.auth", "django.contrib.contenttypes", "django.contrib.sessions", "django.contrib.messages", "django.contrib.staticfiles", "django.contrib.postgres", "items", ]

Slide 33

Slide 33 text

33 $ python -m manage shell >>> from items.models import Item >>> Item.objects.filter(content__icontains="rocks") <QuerySet []> >>> Item.objects.filter(content__search="rocks") <QuerySet [<Item: …>]>

Slide 34

Slide 34 text

34 -- -- Django ORM generated SQL from PostgreSQL -- SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock" FROM "items_item" WHERE to_tsvector( COALESCE("items_item"."content", '') ) @@ ( plainto_tsquery('rocks') );

Slide 35

Slide 35 text

35 # semanticsearch/items/models.py from django.db import models from django.contrib.postgres import search class Item(models.Model): ... vector = models.GeneratedField( db_persist=True, expression=search.SearchVector( "content", config="english" ), output_field=search.SearchVectorField(), )

Slide 36

Slide 36 text

36 $ # --------- $ # Migration $ # --------- $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0002_item_vector.py - Add field vector to item $ python -m manage migrate items Operations to perform: Target specific migration: items Running migrations: Applying items.0002_item_vector... OK $ python -m manage sqlmigrate items 0002

Slide 37

Slide 37 text

37 BEGIN; -- -- Add field vector to item -- ALTER TABLE "items_item" ADD COLUMN "vector" tsvector GENERATED ALWAYS AS ( to_tsvector('english'::regconfig, COALESCE("content", '')) ) STORED; COMMIT;

Slide 38

Slide 38 text

“… improve search accuracy by understanding the searcher's intent and the contextual meaning of terms …” 38 — Wikipedia “Semantic search”

Slide 39

Slide 39 text

39 Embedding System Vector embeddings [ [1,3,4], … ] Data 📸 📄 📽 🎧 Embedding model 🧮

Slide 40

Slide 40 text

No content

Slide 41

Slide 41 text

41 Vector database Open Source licensed

Slide 42

Slide 42 text

42 Vector database PROS Popular Resources CONS Driver Synchronization

Slide 43

Slide 43 text

43 Vector database synchronization Vector database

Slide 44

Slide 44 text

© 2019 Nam Anh (Unsplash Free Use) 44

Slide 45

Slide 45 text

$ # --------------- $ # pgvector-python $ # --------------- $ $ python -m pip install pgvector ... Successfully installed pgvector-0.2.5 45

Slide 46

Slide 46 text

$ # ---------------- $ # vector extension $ # ---------------- $ $ python -m manage makemigrations --empty --name pgvector items Migrations for 'items': items/migrations/0003_pgvector.py 46

Slide 47

Slide 47 text

47 # items/migrations/0003_pgvector.py from django.db import migrations from pgvector.django import VectorExtension class Migration(migrations.Migration): dependencies = [ ('items', '0002_item_vector'), ] operations = [VectorExtension()]

Slide 48

Slide 48 text

48 $ # --------- $ # Migration $ # --------- $ $ python -m manage migrate items Operations to perform: Target specific migration: items Running migrations: Applying items.0003_pgvector... OK $ python -m manage sqlmigrate items 0003

Slide 49

Slide 49 text

49 BEGIN; -- -- Creates extension vector -- CREATE EXTENSION IF NOT EXISTS "vector"; COMMIT;

Slide 50

Slide 50 text

50 # semanticsearch/items/models.py from django.db import models from pgvector.django import VectorField class Item(models.Model): ... embedding = VectorField(dimensions=512, editable=False)

Slide 51

Slide 51 text

51 $ # --------- $ # Migration $ # --------- $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0004_item_embedding.py - Add field embedding to item $ python -m manage migrate items Operations to perform: Target specific migration: items Running migrations: Applying items.0004_item_embedding... OK $ python -m manage sqlmigrate items 0004

Slide 52

Slide 52 text

52 BEGIN; -- -- Add field embedding to item -- ALTER TABLE "items_item" ADD COLUMN "embedding" vector(512) NOT NULL; COMMIT;

Slide 53

Slide 53 text

53

Slide 54

Slide 54 text

$ # --------------------- $ # Sentence Transformers $ # --------------------- $ $ python -m pip install sentence-transformers … Successfully installed … sentence-transformers-2.7.0 … 54

Slide 55

Slide 55 text

55 # semanticsearch/items/models.py from django.db import models from sentence_transformers import SentenceTransformer T = SentenceTransformer("distiluse-base-multilingual-cased-v1") class Item(models.Model): ... def save(self, *args, **kwargs): self.embedding = T.encode(self.content) super().save(*args, **kwargs)

Slide 56

Slide 56 text

56 $ python -m manage shell >>> from items.models import Item >>> for item in Item.objects.all(): ... item.save()

Slide 57

Slide 57 text

57 # semanticsearch/items/models.py from django.db import models from pgvector.django import CosineDistance class Item(models.Model): ... @classmethod def search(cls, q, dmax=0.5): distance = CosineDistance("embedding", T.encode(q)) return ( cls.objects.alias(distance=distance) .filter(distance__lt=dmax) .order_by(distance) )

Slide 58

Slide 58 text

58 $ python -m manage shell >>> from items.models import Item >>> Item.search("rock") <QuerySet [<Item: …>, <Item: …>, <Item: …>, …

Slide 59

Slide 59 text

59 -- -- Django ORM generated SQL from PostgreSQL -- SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock", "items_item"."embedding" FROM "items_item" WHERE ("items_item"."embedding" <=> '[...]') < 0.5 ORDER BY ("items_item"."embedding" <=> '[...]') ASC;

Slide 60

Slide 60 text

60 # semanticsearch/items/admin.py from django.contrib import admin from items.models import Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): ... def get_search_results(self, request, queryset, term): queryset, _ = super().get_search_results( request, queryset, term ) if term: queryset |= self.model.search(term) return queryset, _

Slide 61

Slide 61 text

No content

Slide 62

Slide 62 text

License This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. CC BY-SA 4.0 CC

Slide 63

Slide 63 text

@paulox@fosstodon.org @pauloxnet @paolomelchiorre @pauloxnet Paolo Melchiorre paulox.net