Fully asynchronous queue solution with Kafka

Queue with asyncio and Kafka Showcase Ondřej Veselý Kiwi.com

What kind of data we have

Problem: store JSON to a database. Just a few records per
second, but: • Slow database • Unreliable database • Increasing traffic (20x)

def save_data(conn, cur, ts, data): cur.execute( """INSERT INTO data (timestamp,
data) VALUES (%s,%s) """, (ts, ujson.dumps(data))) conn.commit() @app.route('/store', method=['PUT', 'POST']) def logstash_route(): data = ujson.load(request.body) conn = psycopg2.connect(**config.pg_logs) t = datetime.now() with conn.cursor(cursor_factory=DictCursor) as cur: for d in data: save_data(conn, cur, t, d) conn.close() Old code

Architecture: internet → Kafka producer (/store) → Kafka queue → Kafka consumer → Postgres
… time to kill consumer ...

Asyncio, example import asyncio async def factorial(name, number): f =
1 for i in range(2, number+1): print("Task %s: Compute factorial(%s)..." % (name, i)) await asyncio.sleep(1) f *= i print("Task %s: factorial(%s) = %s" % (name, number, f)) loop = asyncio.get_event_loop () tasks = [ asyncio.ensure_future(factorial("A", 2)), asyncio.ensure_future(factorial("B", 3)), asyncio.ensure_future(factorial("C", 4))] loop.run_until_complete (asyncio.gather(*tasks)) loop.close() Task A: Compute factorial( 2)... Task B: Compute factorial( 2)... Task C: Compute factorial( 2)... Task A: factorial( 2) = 2 Task B: Compute factorial( 3)... Task C: Compute factorial( 3)... Task B: factorial( 3) = 6 Task C: Compute factorial( 4)... Task C: factorial( 4) = 24

What we used Apache Kafka Not ujson Concurrency - doing
lots of slow things at once. No processes, no threads. Producer from aiohttp import web import json Consumer import asyncio import json from aiokafka import AIOKafkaConsumer import aiopg

Producer #1 async def kafka_send(kafka_producer , data, topic): message =
{ 'data': data, 'received': str(arrow.utcnow()) } message_json_bytes = bytes(json.dumps(message), 'utf-8') await kafka_producer .send_and_wait(topic, message_json_bytes ) async def handle(request): post_data = await request .json() try: await kafka_send (request.app['kafka_p'], post_data, topic=settings.topic) except: slog.exception("Kafka Error") await destroy_all () return web.Response(status=200) app = web.Application() app.router.add_route('POST', '/store', handle)

Destroying the loop async def destroy_all(): loop = asyncio.get_event_loop ()
for task in asyncio.Task.all_tasks(): task.cancel() await loop .stop() await loop .close() slog.debug("Exiting.") sys.exit() def get_kafka_producer (): loop = asyncio.get_event_loop () producer = AIOKafkaProducer ( loop=loop, bootstrap_servers =settings.queues_urls, request_timeout_ms =settings.kafka_timeout, retry_backoff_ms =1000) loop.run_until_complete (producer.start()) return producer Getting producer Producer #2

Consume … time to resurrect consumer ... Consume flow: start → DB connected? (yes/no) → 1.
Receive data record from Kafka 2. Put it in the queue. Flush flow: start → Connect to DB → queue full enough or data old enough? (yes/no) → Store data from queue to DB. The two flows are linked by an asyncio.Queue(). Consumer #1

def main(): dbs_connected = asyncio.Future() batch = asyncio.Queue(maxsize=settings.batch_max_size ) asyncio.ensure_future(consume(batch,
dbs_connected )) asyncio.ensure_future(start_flushing (batch, dbs_connected )) loop.run_forever() async def consume(queue, dbs_connected ): await asyncio .wait_for(dbs_connected, timeout=settings.wait_for_databases ) consumer = AIOKafkaConsumer ( settings .topic, loop=loop, bootstrap_servers =settings.queues_urls, group_id ='consumers' ) await consumer .start() async for msg in consumer: message = json.loads(msg.value.decode("utf-8")) await queue .put((message.get('received'), message.get('data'))) await consumer .stop() Consumer #2

async def start_flushing(queue, dbs_connected): db_logg = await aiopg.create_pool(settings.logs_db_url) while True:
async with db_logg.acquire() as logg_conn, logg_conn.cursor() as logg_cur: await keep_flushing (dbs_connected, logg_cur, queue) await asyncio .sleep(2) async def keep_flushing(dbs_connected, logg_cur, queue): dbs_connected.set_result(True) last_stored_time = time.time() while True: if not queue.empty() and (queue.qsize() > settings.batch_flush_size or time .time() - last_stored_time > settings.batch_max_time): to_store = [] while not queue.empty(): to_store .append(await queue.get()) try: await store_bulk (logg_cur, to_store) except: break # DB down, breaking to reconnect last_stored_time = time.time() await asyncio .sleep(settings.batch_sleep) Consumer #3

Code is public on GitLab: https://gitlab.skypicker.com/ondrej/faqstorer • www.orwen.org ← this is
me • code.kiwi.com ← dev community around Kiwi.com • Check graphs...

Fully asynchronous queue solution with Kafka

Fully asynchronous queue solution with Kafka

Moscow Python Meetup PRO

More Decks by Moscow Python Meetup

Other Decks in Programming

Featured

Transcript

Queue with asyncio and Kafka Showcase Ondřej Veselý Kiwi.com

What kind of data we have

Problem: store JSON to database Just a few records per

def save_data(conn, cur, ts, data): cur.execute( """INSERT INTO data (timestamp,

Architecture internet Kafka producer /store Kafka consumer Kafka queue Postgres

Asyncio, example import asyncio async def factorial(name, number): f =

What we used Apache Kafka Not ujson Concurrency - doing

Producer #1 async def kafka_send(kafka_producer , data, topic): message =

Destroying the loop async def destroy_all(): loop = asyncio.get_event_loop ()

Consume … time to resurrect consumer ... DB connected 1.

def main(): dbs_connected = asyncio.Future() batch = asyncio.Queue(maxsize=settings.batch_max_size ) asyncio.ensure_future(consume(batch,

async def start_flushing(queue, dbs_connected): db_logg = await aiopg.create_pool(settings.logs_db_url) while True:

Code is public on gitlab https://gitlab.skypicker.com/ondrej/faqstorer www.orwen.org ← this is