Upgrade to Pro — share decks privately, control downloads, hide ads and more …

BigQueryでprotobufをパースした話 / parsing protobuf in BigQuery

Masaki Hara
October 10, 2019

BigQueryでprotobufをパースした話 / parsing protobuf in BigQuery

BigQueryはログ等の集計をSQLで行うことができ、JSONを含むような複雑な集計にも対応していますが、protobufというシリアライゼーションフォーマットには対応していません。BigQueryにはJavaScriptでユーザー定義の関数を作る機能があり、これを使ってprotobuf形式のログを集計する機会があったため紹介しています。

Masaki Hara

October 10, 2019
Tweet

More Decks by Masaki Hara

Other Decks in Programming

Transcript

  1. ©2019 Wantedly, Inc. ໊લ௨Γɺ#JH2VFSZ্Ͱ
 Ϣʔβʔఆٛؔ਺Λ࡞Δ͜ͱ͕Ͱ͖Δ +BWB4DSJQU·ͨ͸42-ͰఆٛͰ͖Δ UDF User Defined Function

    BigQuery UDF ˞(PPHMF$MPVE4UPSBHF্ͷ+BWB4DSJQUϥΠϒϥϦΛಡΈࠐΜͩΓɺ #JH2VFSZͷϓϩδΣΫτ্ʹ6%'ΛӬଓԽ͢Δ͜ͱ΋Ͱ͖Δɻ
 ࠓճ͸खܰʹ࢖ͬͨΓॻ͖׵͑ͨΓͰ͖Δ΋ͷΛ࡞Γ͔ͨͬͨͷͰɺ Ұ࣌6%'ͱ࣮ͯ͠૷ͨ͠ɻ
  2. ©2019 Wantedly, Inc. CREATE TEMP FUNCTION parseMessage(messageB64 BYTES) RETURNS STRING

    LANGUAGE js AS """ const toSextet = (ch) => { if (0x41 <= ch && ch < 0x41 + 26) { return ch - 0x41; } else if (0x61 <= ch && ch < 0x71 + 26) { return ch - (0x61 - 26); } else if (0x30 <= ch && ch < 0x30 + 10) { return ch + (52 - 0x30); } else if (ch === 0x2B) { return 62; } else if (ch === 0x2F) { return 63; } else { return 0; } }; const decodeBase64 = (message) => { const numPadded = message.length < 2 ? 0 : ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc. https://gist.github.com/qnighy/79d5eedbd4cf26a573c2cbd09a4b3956
  3. ©2019 Wantedly, Inc. } else { return 0; } };

    const decodeBase64 = (message) => { const numPadded = message.length < 2 ? 0 : message.charCodeAt(message.length - 2) === 0x3D ? 2 : message.charCodeAt(message.length - 1) === 0x3D ? 1 : 0; const numChunks = (message.length / 4) | 0; const ret = new Array(numChunks * 3 - numPadded); for(let i = 0; i < numChunks; ++i) { const chunkPad = i + 1 == numChunks ? numPadded : 0; const ch0 = toSextet(message.charCodeAt(i * 4)); const ch1 = toSextet(message.charCodeAt(i * 4 + 1)); const ch2 = toSextet(message.charCodeAt(i * 4 + 2)); const ch3 = toSextet(message.charCodeAt(i * 4 + 3)); ret[i * 3] = (ch0 << 2) | (ch1 >> 4); if (chunkPad < 2) ret[i * 3 + 1] = ((ch1 & 15) << 4) | (ch2 >> 2); if (chunkPad < 1) ret[i * 3 + 2] = ((ch2 & 3) << 6) | ch3; } return ret; }; class Parser { constructor(bytes, index, limit) { this.bytes = bytes; ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  4. ©2019 Wantedly, Inc. } return ret; }; class Parser {

    constructor(bytes, index, limit) { this.bytes = bytes; this.index = index; this.limit = limit; } eatByte() { if (this.index >= this.limit) { throw `EOF: ${this.index} + 1 > ${this.limit}`; } return this.bytes[this.index++]; } eatBytes(len) { const index = this.index; if (this.index + len > this.limit) { throw `EOF: ${this.index} + ${len} > ${this.limit}`; } this.index += len; return this.bytes.slice(index, index + len); } ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  5. ©2019 Wantedly, Inc. return this.bytes.slice(index, index + len); } subParser(len)

    { const index = this.index; if (index + len > this.limit) { throw `EOF: ${this.index} + ${len} > ${this.limit}`; } this.index += len; return new Parser(this.bytes, index, index + len); } convertToBytes() { const bytes = this.bytes.slice(this.index, this.limit); this.index = this.limit; return bytes; } convertToString() { const bytes = this.convertToBytes(); const bytesString = String.fromCharCode(...bytes); return decodeURIComponent(escape(bytesString)); } eatVarint() { ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  6. ©2019 Wantedly, Inc. return decodeURIComponent(escape(bytesString)); } eatVarint() { let shift

    = 0; let ret = 0; while(true) { const byte = this.eatByte(); ret |= (byte & 127) << shift; if ((byte & 128) == 0) { return ret; } shift += 7; } } eatField() { const tag = this.eatVarint(); const field = tag >> 3; const wireType = tag & 7; let ret; if (wireType === 0) { ret = this.eatVarint(); } else if (wireType === 1) { ret = this.eatBytes(64); ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  7. ©2019 Wantedly, Inc. } eatField() { const tag = this.eatVarint();

    const field = tag >> 3; const wireType = tag & 7; let ret; if (wireType === 0) { ret = this.eatVarint(); } else if (wireType === 1) { ret = this.eatBytes(64); } else if (wireType === 2) { ret = this.subParser(this.eatVarint()); } else if (wireType === 3) { throw "TODO: wireType == 3"; } else if (wireType === 4) { throw "TODO: wireType == 4"; } else if (wireType === 5) { ret = this.eatBytes(32); } else { throw "Unknown wire type"; } return [field, wireType, ret]; } ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  8. ©2019 Wantedly, Inc. } return [field, wireType, ret]; } parseFieldMask()

    { const ret = { paths: [] }; while (this.index < this.limit) { const [field, wireType, fieldData] = this.eatField(); if (field === 1) { ret.paths.push(fieldData.convertToString()); } } return ret; } parseWorkingHistory() { const ret = {}; while (this.index < this.limit) { const [field, wireType, fieldData] = this.eatField(); if (field === 1) { ret._op = ["OPERATION_UNSPECIFIED", "CREATE", "UPDATE", "DELETE"] [fieldData] || fieldData; } else if (field === 2) { ret.id = fieldData; } ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  9. ©2019 Wantedly, Inc. [fieldData] || fieldData; } else if (field

    === 2) { ret.id = fieldData; } } return ret; } parseProfile() { const ret = { working_histories: [] }; while (this.index < this.limit) { const [field, wireType, fieldData] = this.eatField(); if (field === 1) { ret._op = ["OPERATION_UNSPECIFIED", "CREATE", "UPDATE", "DELETE"] [fieldData] || fieldData; } else if (field === 2) { ret.id = fieldData; } else if (field === 17) { ret.working_histories.push(fieldData.parseWorkingHistory()); } } return ret; } parseUser() { ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  10. ©2019 Wantedly, Inc. return ret; } parseUser() { const ret

    = {}; while (this.index < this.limit) { const [field, wireType, fieldData] = this.eatField(); if (field === 1) { ret._op = ["OPERATION_UNSPECIFIED", "CREATE", "UPDATE", "DELETE"] [fieldData] || fieldData; } else if (field === 2) { ret.id = fieldData; } else if (field === 9) { ret.profile = fieldData.parseProfile(); } } return ret; } parseMessage() { const ret = {}; while (this.index < this.limit) { const [field, wireType, fieldData] = this.eatField(); if (field === 1) { ret.field_mask = fieldData.parseFieldMask(); ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  11. ©2019 Wantedly, Inc. } return ret; } parseMessage() { const

    ret = {}; while (this.index < this.limit) { const [field, wireType, fieldData] = this.eatField(); if (field === 1) { ret.field_mask = fieldData.parseFieldMask(); } else if (field === 2) { ret.user = fieldData.parseUser(); } } return ret; } } const messageBytes = decodeBase64(messageB64); const parser = new Parser(messageBytes, 0, messageBytes.length); // try { return JSON.stringify(parser.parseMessage()); // } catch(e) { return JSON.stringify({ error: e }); } """; CREATE TEMP FUNCTION should_perform(message_json STRING) ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  12. ©2019 Wantedly, Inc. } return ret; } } const messageBytes

    = decodeBase64(messageB64); const parser = new Parser(messageBytes, 0, messageBytes.length); // try { return JSON.stringify(parser.parseMessage()); // } catch(e) { return JSON.stringify({ error: e }); } """; CREATE TEMP FUNCTION should_perform(message_json STRING) RETURNS BOOL LANGUAGE js AS """ const fetchCreatedWorkingHistories = (user, field_mask) => { if(!field_mask.paths.find((x) => x === "profile.working_histories._op")) return []; return user.profile.working_histories.filter((w) => w._op === "CREATE"); }; const { user, field_mask } = JSON.parse(message_json); return user.id && fetchCreatedWorkingHistories(user, field_mask).length > 0; """; ͜Ε͕׬੒඼ͩ ©2019 Wantedly, Inc.
  13. ©2019 Wantedly, Inc. όΠφϦͷड͚औΓ #JH2VFSZʹ͸#:5&4ܕ͕͋Δ͕ɺ+BWB4DSJQUʹ౉͞ΕΔͱ͖͸#BTFΤϯίʔυ͞ΕͨTUSJOHͱͯ͠දݱ͞ΕΔɻ ͸͡Ίʹ#BTFͷύʔαʔΛॻ͍ͨɻ Base64 const toSextet =

    (ch) => { if (0x41 <= ch && ch < 0x41 + 26) { return ch - 0x41; } else if (0x61 <= ch && ch < 0x71 + 26) { return ch - (0x61 - 26); } else if (0x30 <= ch && ch < 0x30 + 10) { return ch + (52 - 0x30); } else if (ch === 0x2B) { return 62; } else if (ch === 0x2F) { return 63; } else { return 0; } };
  14. ©2019 Wantedly, Inc. όΠφϦͷड͚औΓ #JH2VFSZʹ͸#:5&4ܕ͕͋Δ͕ɺ+BWB4DSJQUʹ౉͞ΕΔͱ͖͸#BTFΤϯίʔυ͞ΕͨTUSJOHͱͯ͠දݱ͞ΕΔɻ ͸͡Ίʹ#BTFͷύʔαʔΛॻ͍ͨɻ Base64 const decodeBase64 =

    (message) => { const numPadded = message.length < 2 ? 0 : message.charCodeAt(message.length - 2) === 0x3D ? 2 : message.charCodeAt(message.length - 1) === 0x3D ? 1 : 0; const numChunks = (message.length / 4) | 0; const ret = new Array(numChunks * 3 - numPadded); for(let i = 0; i < numChunks; ++i) { const chunkPad = i + 1 == numChunks ? numPadded : 0; const ch0 = toSextet(message.charCodeAt(i * 4)); const ch1 = toSextet(message.charCodeAt(i * 4 + 1)); const ch2 = toSextet(message.charCodeAt(i * 4 + 2)); const ch3 = toSextet(message.charCodeAt(i * 4 + 3)); ret[i * 3] = (ch0 << 2) | (ch1 >> 4); if (chunkPad < 2) ret[i * 3 + 1] = ((ch1 & 15) << 4) | (ch2 >> 2); if (chunkPad < 1) ret[i * 3 + 2] = ((ch2 & 3) << 6) | ch3; }
  15. ©2019 Wantedly, Inc. Protobuf   aY&aYaY   aY&aYaY

       aYaY 08 A4 13 22 03 E3 81 82 22 03 E3 81 84 2D 00 00 00 00 98 03 02 08 00
  16. ©2019 Wantedly, Inc. Protobuf   aY&aYaY   aY&aYaY

       aYaY ωετͨ͠ϝοηʔδ   ωετͨ͠ϝοηʔδ͸ɺಘΒΕͨόΠτྻΛ࠶ؼతʹ8JSF 'PSNBUͱͯ͠ύʔε͢Δ ͜ͱͰಘΒΕΔ
  17. ©2019 Wantedly, Inc. Protobuf ωετͨ͠ϝοηʔδ class Parser { constructor(bytes, index,

    limit) { this.bytes = bytes; this.index = index; this.limit = limit; } eatByte() { if (this.index >= this.limit) { throw `EOF: ${this.index} + 1 > ${this.limit}`; } return this.bytes[this.index++]; } } όΠτྻͷ෦෼ྻΛঢ়ଶ͖ͭͰύʔε͍ͨ͠ͷͰɺCZUFT JOEFY MJNJUͷͭ૊Ͱύʔαʔͷঢ়ଶΛ දݱͨ͠ɻ#JH2VFSZͰ΋Ϋϥε͕࢖͑Δ͜ͱ͕Θ͔ͬͨɻ ࠓճͷίʔυͷߏ੒తʹ͸ɺߦॲཧ͢Δ͝ͱʹΫϥε͕ճੜ੒͞Ε͍ͯΔՄೳੑ͕͋Δɻ
  18. ©2019 Wantedly, Inc. Protobuf ωετͨ͠ϝοηʔδ class Parser { subParser(len) {

    const index = this.index; if (index + len > this.limit) { throw `EOF: ${this.index} + ${len} > ${this.limit}`; } this.index += len; return new Parser(this.bytes, index, index + len); } } ͜ͷߏ੒ʹ͓ͯ͘͠ͱɺωετͨ͠ϝοηʔδͷͨΊͷύʔαʔ͕؆୯ʹऔΓग़ͤΔɻ αϒύʔαʔΛ࢖Θͳ͍ͱ͖͸ͦͷ··ࣺͯΕ͹௿ίετͰଓߦͰ͖Δ
  19. ©2019 Wantedly, Inc. Protobuf 7BSJOUܗࣜ class Parser { eatVarint() {

    let shift = 0; let ret = 0; while(true) { const byte = this.eatByte(); ret |= (byte & 127) << shift; if ((byte & 128) == 0) { return ret; } shift += 7; } } } QSPUPCVGʹग़ͯ͘Δ੔਺ͷ΄ͱΜͲ͸͜ͷܗͰΤϯίʔυ͞Ε͍ͯΔɻ CJU͝ͱʹ۠੾ΓϦτϧΤϯσΟΞϯͰฒ΂͚ͨͩͷ؆୯ͳܗࣜͰɺޙଓͷόΠτ͕͋Δͱ͖͸ CJU໨ΛཱͯΔɻ
  20. ©2019 Wantedly, Inc. Protobuf ϑΟʔϧυܗࣜ 'JFMEº  'JFMEº  7BMVF

    CJU 7BMVF WBSJOU 'JFMEº  7BMVF CJU 'JFMEº  7BMVF CZUFT -FOHUI ͜͜΋WBSJOUܗࣜ
  21. ©2019 Wantedly, Inc. Protobuf   aY&aYaY   aY&aYaY

       aYaY 08 A4 13 22 03 E3 81 82 22 03 E3 81 84 2D 00 00 00 00 98 03 02 08 00
  22. ©2019 Wantedly, Inc. εΩʔϚґଘ෦෼ ඞཁͳϝιουΛ1BSTFSΫϥεʹ࣮૷ͯ͠࢖͏ํࣜʹͨ͠ɻݟΔϑΟʔϧυ΋ඞཁͳ΋ͷ͚ͩɻ XIJMFϧʔϓ಺ͰpFME൪߸ʹԠͯ͡ॲཧΛ͢Δɻ Protobuf class Parser {

    parseProfile() { const ret = { working_histories: [] }; while (this.index < this.limit) { const [field, wireType, fieldData] = this.eatField(); if (field === 1) { ret._op = ["OPERATION_UNSPECIFIED", "CREATE", "UPDATE", "DELETE"][fieldData] || fieldData; } else if (field === 2) { ret.id = fieldData; } else if (field === 17) { ret.working_histories.push(fieldData.parseWorkingHistory()); } } return ret; } }
  23. ©2019 Wantedly, Inc.  +BWB4DSJQU͸૊ࠐΈεΫϦϓτͱͯ͠΋ศར  όϕΒͳͯ͘΋࠷৽ͷ+4͕࢖͑Δ؀ڥ͸ָ͍͠  1SPUPCVGύʔαʔ͸ׂͱ؆୯ʹॻ͚Δ ·ͱΊ

    ͓·͚ʹ#JH2VFSZ42-͕ม਺ɾ܁Γฦ͠ɾϝλϨϕϧͷ෼ذΛ#FUB൛ͱͯ͠αϙʔτͨ͠ͷͰɺ ͜ΕΛ࢖ͬͨύʔαʔ΋ॻ͚Δ͔΋͠Ε·ͤΜ