Slide 1

Slide 1 text

Semantic search with Django and pgvector PAOLO MELCHIORRE ~ paulox.net

Slide 2

Slide 2 text

No content

Slide 3

Slide 3 text

● PSF, DSF, Python Italia member ● PyCon Italia co-organizer ● Python Pescara organizer ● Django contributor ● Django Girls coach ● Conference speaker Paolo Melchiorre paulox.net © 2022 Bartek Pawlik (CC BY-NC-SA)

Slide 4

Slide 4 text

4 © 1946 William Gottlieb (Public Domain)

Slide 5

Slide 5 text

5 Django “The web framework for perfectionists with deadlines.” Project start in Lawrence Journal-World (2003) Public release and WSGI (PEP 333) support (2005) Django Software Foundation (2008) Python 3 support (2013) ASGI support (2019) Psycopg 3 support (2022)

Slide 6

Slide 6 text

No content

Slide 7

Slide 7 text

7 """Flask quickstart.""" from flask import Flask app = Flask(__name__) @app.route("/") def hello_world(): return "<p>Hello, World!</p>" # https://flask.palletsprojects.com/en/latest/quickstart/

Slide 8

Slide 8 text

No content

Slide 9

Slide 9 text

9 """FastAPI first step.""" from fastapi import FastAPI app = FastAPI() @app.get("/") async def root(): return {"message": "Hello World"} # https://fastapi.tiangolo.com/tutorial/first-steps/

Slide 10

Slide 10 text

🧬 μDjango

Slide 11

Slide 11 text

"""μDjango WSGI example.""" from django import conf, http, urls from django.core.handlers import wsgi conf.settings.configure(ROOT_URLCONF=__name__) app = wsgi.WSGIHandler() urlpatterns = [urls.path("", lambda r: http.HttpResponse("🚀"))] # https://github.com/pauloxnet/uDjango 11

Slide 12

Slide 12 text

"""μDjango ASGI example.""" from django import conf, http, urls from django.core.handlers import asgi conf.settings.configure(ROOT_URLCONF=__name__) app = asgi.ASGIHandler() async def root(request): return http.JsonResponse({"message": "Hello World"}) urlpatterns = [urls.path("", root)] # https://github.com/pauloxnet/uDjango 12

Slide 13

Slide 13 text

No content

Slide 14

Slide 14 text

$ # ------------ $ # Requirements $ # ------------ $ $ python3 --version Python 3.12.0 $ python3.12 -m venv ~/.venv $ . ~/.venv/bin/activate $ python -m pip install django ... Successfully installed asgiref-3.8 django-5.0 sqlparse-0.4 14

Slide 15

Slide 15 text

$ # ------------- $ # Start project $ # ------------- $ $ cd ~/projects $ python -m django startproject semanticsearch $ tree --noreport semanticsearch/ semanticsearch/ ├── manage.py └── semanticsearch ├── asgi.py ├── __init__.py ├── settings.py ├── urls.py └── wsgi.py 15

Slide 16

Slide 16 text

16 $ # --------- $ # Start app $ # --------- $ $ cd semanticsearch/ $ python -m django startapp items $ tree --noreport items/ items/ ├── admin.py ├── apps.py ├── __init__.py ├── migrations │ └── __init__.py ├── models.py ├── tests.py └── views.py

Slide 17

Slide 17 text

17 # semanticsearch/semanticsearch/settings.py INSTALLED_APPS = [ "django.contrib.admin", "django.contrib.auth", "django.contrib.contenttypes", "django.contrib.sessions", "django.contrib.messages", "django.contrib.staticfiles", "items", ]

Slide 18

Slide 18 text

18 # semanticsearch/items/models.py from django.db import models class Item(models.Model): content = models.TextField() price = models.IntegerField(db_default=10) in_stock = models.BooleanField(db_default=True)

Slide 19

Slide 19 text

19 $ # --------- $ # Migration $ # --------- $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0001_initial.py - Create model Item $ python -m manage migrate Operations to perform: Target specific migration: 0001_initial, from items Running migrations: Applying items.0001_initial... OK $ python -m manage sqlmigrate items 0001

Slide 20

Slide 20 text

20 BEGIN; -- -- Create model Item -- CREATE TABLE "items_item" ( "id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "content" text NOT NULL, "price" integer DEFAULT 10 NOT NULL, "in_stock" bool DEFAULT 1 NOT NULL ); COMMIT;

Slide 21

Slide 21 text

21 $ python -m manage shell Python 3.12.0 (main, Oct 4 2023) [GCC 13.2.0] on linux Type "help", "copyright", "credits" or "license" for more info. >>> from items.models import Item >>> Item.objects.filter( ... content__icontains="rock", price=10, in_stock=True ... ).order_by( ... "content" ... ).values_list( ... "content" ... )

Slide 22

Slide 22 text

22 -- -- Django ORM generated SQL from SQLite -- SELECT "items_item"."content" FROM "items_item" WHERE ( "items_item"."content" LIKE '%rock%' ESCAPE '\' AND "items_item"."in_stock" AND "items_item"."price" = 10 ) ORDER BY "items_item"."content" ASC;

Slide 23

Slide 23 text

23 # semanticsearch/items/admin.py from django.contrib import admin from items.models import Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): list_display = ["content" , "price", "in_stock"] list_filter = ["price", "in_stock"] search_fields = ["content"] show_facets = admin.ShowFacets.ALWAYS

Slide 24

Slide 24 text

$ # ---------- $ # Run server $ # ---------- $ $ python -m manage createsuperuser $ python -m manage runserver Watching for file changes with StatReloader Performing system checks... System check identified no issues (0 silenced). April 05, 2024 - 19:30:00 Django version 5.0.4, using settings 'semanticsearch.settings' Starting development server at http://127.0.0.1:8000/ Quit the server with CONTROL-C. 24

Slide 25

Slide 25 text

No content

Slide 26

Slide 26 text

No content

Slide 27

Slide 27 text

No content

Slide 28

Slide 28 text

No content

Slide 29

Slide 29 text

No content

Slide 30

Slide 30 text

$ # ---------- $ # Psycopg v3 $ # ---------- $ $ python -m pip install psycopg[binary] ... Successfully installed psycopg-3.1.18 psycopg-binary-3.1.18 typing-extensions-4.10.0 30

Slide 31

Slide 31 text

31 # semanticsearch/semanticsearch/settings.py DATABASES = { "default": { "ENGINE": "django.db.backends.postgresql", "HOST": "", "NAME": "", "PORT": "", "USER": "", } }

Slide 32

Slide 32 text

32 # semanticsearch/semanticsearch/settings.py INSTALLED_APPS = [ "django.contrib.admin", "django.contrib.auth", "django.contrib.contenttypes", "django.contrib.sessions", "django.contrib.messages", "django.contrib.staticfiles", "django.contrib.postgres", "items", ]

Slide 33

Slide 33 text

33 $ python -m manage shell Python 3.12.0 (main, Oct 4 2023) [GCC 13.2.0] on linux Type "help", "copyright", "credits" or "license" for more info. >>> from items.models import Item >>> Item.objects.filter(content__icontains="rocks") >>> Item.objects.filter( ... content__search="rocks" ... ).values_list("content", flat=True)

Slide 34

Slide 34 text

34 -- -- Django ORM generated SQL from PostgreSQL -- SELECT "items_item"."content" FROM "items_item" WHERE to_tsvector( COALESCE("items_item"."content", '') ) @@ ( plainto_tsquery('rocks') );

Slide 35

Slide 35 text

35 # semanticsearch/items/models.py from django.db import models from django.contrib.postgres import search class Item(models.Model): ... vector = models.GeneratedField( db_persist=True, expression=search.SearchVector( "content", config="english" ), output_field=search.SearchVectorField(), )

Slide 36

Slide 36 text

36 $ # --------- $ # Migration $ # --------- $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0002_item_vector.py - Add field vector to item $ python -m manage migrate Operations to perform: Target specific migration: 0002_item_vector, from items Running migrations: Applying items.0002_item_vector... OK $ python -m manage sqlmigrate items 0002

Slide 37

Slide 37 text

37 BEGIN; -- -- Add field vector to item -- ALTER TABLE "items_item" ADD COLUMN "vector" tsvector GENERATED ALWAYS AS ( to_tsvector('english'::regconfig, COALESCE("content", '')) ) STORED; COMMIT;

Slide 38

Slide 38 text

“… improve search accuracy by understanding the searcher's intent and the contextual meaning of terms …” 38 — Wikipedia “Semantic search”

Slide 39

Slide 39 text

39 Embedding System Vector embeddings [ [1,3,4], … ] Data 📸 📄 📽 🎧 Embedding model 🧮

Slide 40

Slide 40 text

No content

Slide 41

Slide 41 text

41 Vector database Open Source licensed

Slide 42

Slide 42 text

42 Vector database PROS Popular Resources CONS Driver Synchronization

Slide 43

Slide 43 text

43 Vector database synchronization Vector database

Slide 44

Slide 44 text

© 2019 Nam Anh (Unsplash Free Use) 44

Slide 45

Slide 45 text

$ # --------------- $ # pgvector-python $ # --------------- $ $ python -m pip install pgvector ... Successfully installed pgvector-0.2.5 45

Slide 46

Slide 46 text

$ # ---------------- $ # vector extension $ # ---------------- $ $ python -m manage makemigrations --empty --name pgvector items Migrations for 'items': items/migrations/0003_pgvector.py 46

Slide 47

Slide 47 text

47 # items/migrations/0003_pgvector.py from django.db import migrations from pgvector.django import VectorExtension class Migration(migrations.Migration): dependencies = [] operations = [VectorExtension()]

Slide 48

Slide 48 text

48 # semanticsearch/items/models.py from django.db import models from pgvector.django import VectorField class Item(models.Model): ... embedding = VectorField(dimensions=512, editable=False)

Slide 49

Slide 49 text

49 $ # --------- $ # Migration $ # --------- $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0004_item_embedding.py - Add field embedding to item $ python -m manage migrate Operations to perform: Target specific migration: 0004_item_embedding, from items Running migrations: Applying items.0004_item_embedding... OK $ python -m manage sqlmigrate items 0004

Slide 50

Slide 50 text

50 BEGIN; -- -- Add field embedding to item -- ALTER TABLE "items_item" ADD COLUMN "embedding" vector(512) NOT NULL; COMMIT;

Slide 51

Slide 51 text

51

Slide 52

Slide 52 text

$ # --------------------- $ # Sentence Transformers $ # --------------------- $ $ python -m pip install sentence-transformers ... Successfully installed ... sentence-transformers-2.6.1 52

Slide 53

Slide 53 text

53 # semanticsearch/items/models.py from django.db import models from pgvector.django import CosineDistance class Item(models.Model): ... @classmethod def search(cls, q, dmax=0.5): distance = CosineDistance("embedding", T.encode(q)) return ( cls.objects.alias(distance=distance) .filter(distance__lt=dmax) .order_by(distance) )

Slide 54

Slide 54 text

54 -- -- Django ORM generated SQL from PostgreSQL -- SELECT "items_item"."id", "items_item"."content", "items_item"."embedding" FROM "items_item" WHERE ("items_item"."embedding" <=> '[...]') < 0.5 ORDER BY ("items_item"."embedding" <=> '[...]') ASC;

Slide 55

Slide 55 text

55 # semanticsearch/items/admin.py from django.contrib import admin from items.models import Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): ... def get_search_results(self, request, ...): queryset, _ = super().get_search_results(request,...) if search_term: queryset |= self.model.search(search_term) return queryset, _

Slide 56

Slide 56 text

No content

Slide 57

Slide 57 text

License This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. CC BY-SA 4.0 CC

Slide 58

Slide 58 text

@paulox@fosstodon.org @pauloxnet @paolomelchiorre @pauloxnet Paolo Melchiorre paulox.net

Slide 59

Slide 59 text

@[email protected] @PythonPescara @python-pescara @PythonPescara Python Pescara pescara.python.it