From 21a02564e3da5f1a6dfe51d26359f848faf31dd7 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Fri, 19 Dec 2025 09:11:37 -0500 Subject: [PATCH 1/3] Update dependencies --- Pipfile.lock | 358 +++++++++++++++++++++++++-------------------------- 1 file changed, 178 insertions(+), 180 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index d52d52b..bd7cc81 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -27,20 +27,20 @@ }, "boto3": { "hashes": [ - "sha256:c5cb2ada690c14e2dfa1e1c59ef7ef399c5e381f5514f1541d28310e35192300", - "sha256:eda49046c0f6a21ac159f9b2d609e5cc70d1dd019b7ac9618eec99285282b3db" + "sha256:4c9a62dcb5c3f905630fe99fb4b81131da84c5c92eedcc81a89cbd924c1c524f", + "sha256:9d6aad3fa8b90567006bf7b32efa26489fc306fbe63946eaf57b72356a45761d" ], "index": "pypi", "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "botocore": { "hashes": [ - "sha256:92128d56654342f026d5c20a92bf0e8b546be1eb38df2c0efc7433e8bbc39045", - "sha256:cc401b4836eae2a781efa1d1df88b2e92f9245885a6ae1bf9a6b26bc97b3efd2" + "sha256:7e4cf14bd5719b60600fb45d2bb3ae140feb3c182a863b93093aafce7f93cfee", + "sha256:b750b2de4a2478db9718a02395cb9da8698901ba02378d60037d6369ecb6bb88" ], "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "duckdb": { "hashes": [ @@ -419,11 +419,11 @@ }, "tzdata": { "hashes": [ - "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", - "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9" + "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", + "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7" ], "markers": "python_version >= '2'", - "version": "==2025.2" + "version": "==2025.3" }, "urllib3": { "hashes": [ @@ -486,39 +486,38 @@ }, "boto3": { "hashes": [ - "sha256:c5cb2ada690c14e2dfa1e1c59ef7ef399c5e381f5514f1541d28310e35192300", - "sha256:eda49046c0f6a21ac159f9b2d609e5cc70d1dd019b7ac9618eec99285282b3db" + "sha256:4c9a62dcb5c3f905630fe99fb4b81131da84c5c92eedcc81a89cbd924c1c524f", + "sha256:9d6aad3fa8b90567006bf7b32efa26489fc306fbe63946eaf57b72356a45761d" ], - "index": "pypi", "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "boto3-stubs": { "extras": [ "essential" ], "hashes": [ - "sha256:2d3f9e10686ca7ca054b788450fcc3d505b9f74954817afd994f71742df6d883", - "sha256:e4f153103a3e67a50aa98b9141382a16d37e943aef957319fe3a48e9b4a33a5a" + "sha256:2683835a105262e1d7404f638383c230d58178a31ffcd88ad70db941a8274427", + "sha256:de769f1e414ce4a69fa2f79c21054242501c27fbe964c69cde797fcc18e681c1" ], "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "botocore": { "hashes": [ - "sha256:92128d56654342f026d5c20a92bf0e8b546be1eb38df2c0efc7433e8bbc39045", - "sha256:cc401b4836eae2a781efa1d1df88b2e92f9245885a6ae1bf9a6b26bc97b3efd2" + "sha256:7e4cf14bd5719b60600fb45d2bb3ae140feb3c182a863b93093aafce7f93cfee", + "sha256:b750b2de4a2478db9718a02395cb9da8698901ba02378d60037d6369ecb6bb88" ], "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": "==1.42.13" }, "botocore-stubs": { "hashes": [ - "sha256:951fc41e78e1a53b49bfe8511bc24e430eda4c689ca6033c643693e56485e69c", - "sha256:d5c9ac851e8d7ce30d25204add0e2448cb23cb5f8fbe6b5c768ea1e4471b4455" + "sha256:7a1d5749a5088fa3184add28efad1e6548039b40d8d00413c69021c0c31501ee", + "sha256:d565f01c31034e7fd86193c0f4937179a67a971098947c0d1e83d81973a9ec4c" ], "markers": "python_version >= '3.9'", - "version": "==1.42.7" + "version": 
"==1.42.13" }, "cachecontrol": { "extras": [ @@ -981,11 +980,11 @@ }, "filelock": { "hashes": [ - "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", - "sha256:711e943b4ec6be42e1d4e6690b48dc175c822967466bb31c0c293f34334c13f4" + "sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a", + "sha256:b8360948b351b80f420878d8516519a2204b07aefcdcfd24912a5d33127f188c" ], "markers": "python_version >= '3.10'", - "version": "==3.20.0" + "version": "==3.20.1" }, "identify": { "hashes": [ @@ -1054,85 +1053,85 @@ }, "librt": { "hashes": [ - "sha256:020c6db391268bcc8ce75105cb572df8cb659a43fd347366aaa407c366e5117a", - "sha256:0fa9ac2e49a6bee56e47573a6786cb635e128a7b12a0dc7851090037c0d397a3", - "sha256:11ad45122bbed42cfc8b0597450660126ef28fd2d9ae1a219bc5af8406f95678", - "sha256:120dd21d46ff875e849f1aae19346223cf15656be489242fe884036b23d39e93", - "sha256:14569ac5dd38cfccf0a14597a88038fb16811a6fede25c67b79c6d50fc2c8fdc", - "sha256:1617bea5ab31266e152871208502ee943cb349c224846928a1173c864261375e", - "sha256:170cdb8436188347af17bf9cccf3249ba581c933ed56d926497119d4cf730cec", - "sha256:1975eda520957c6e0eb52d12968dd3609ffb7eef05d4223d097893d6daf1d8a7", - "sha256:1fe603877e1865b5fd047a5e40379509a4a60204aa7aa0f72b16f7a41c3f0712", - "sha256:24d70810f6e2ea853ff79338001533716b373cc0f63e2a0be5bc96129edb5fb5", - "sha256:256793988bff98040de23c57cf36e1f4c2f2dc3dcd17537cdac031d3b681db71", - "sha256:25711f364c64cab2c910a0247e90b51421e45dbc8910ceeb4eac97a9e132fc6f", - "sha256:2682162855a708e3270eba4b92026b93f8257c3e65278b456c77631faf0f4f7a", - "sha256:2cf9d73499486ce39eebbff5f42452518cc1f88d8b7ea4a711ab32962b176ee2", - "sha256:2e40520c37926166c24d0c2e0f3bc3a5f46646c34bdf7b4ea9747c297d6ee809", - "sha256:2e980cf1ed1a2420a6424e2ed884629cdead291686f1048810a817de07b5eb18", - "sha256:2f03484b54bf4ae80ab2e504a8d99d20d551bfe64a7ec91e218010b467d77093", - "sha256:35f1609e3484a649bb80431310ddbec81114cd86648f1d9482bc72a3b86ded2e", - "sha256:399938edbd3d78339f797d685142dd8a623dfaded023cf451033c85955e4838a", - "sha256:399bbd7bcc1633c3e356ae274a1deb8781c7bf84d9c7962cc1ae0c6e87837292", - "sha256:3ec50cf65235ff5c02c5b747748d9222e564ad48597122a361269dd3aa808798", - "sha256:3edbf257c40d21a42615e9e332a6b10a8bacaaf58250aed8552a14a70efd0d65", - "sha256:440c788f707c061d237c1e83edf6164ff19f5c0f823a3bf054e88804ebf971ec", - "sha256:44b3689b040df57f492e02cd4f0bacd1b42c5400e4b8048160c9d5e866de8abe", - "sha256:4887c29cadbdc50640179e3861c276325ff2986791e6044f73136e6e798ff806", - "sha256:5460d99ed30f043595bbdc888f542bad2caeb6226b01c33cda3ae444e8f82d42", - "sha256:550fdbfbf5bba6a2960b27376ca76d6aaa2bd4b1a06c4255edd8520c306fcfc0", - "sha256:56f2a47beda8409061bc1c865bef2d4bd9ff9255219402c0817e68ab5ad89aed", - "sha256:572a24fc5958c61431da456a0ef1eeea6b4989d81eeb18b8e5f1f3077592200b", - "sha256:59cb0470612d21fa1efddfa0dd710756b50d9c7fb6c1236bbf8ef8529331dc70", - "sha256:6038ccbd5968325a5d6fd393cf6e00b622a8de545f0994b89dd0f748dcf3e19e", - "sha256:6488e69d408b492e08bfb68f20c4a899a354b4386a446ecd490baff8d0862720", - "sha256:687403cced6a29590e6be6964463835315905221d797bc5c934a98750fe1a9af", - "sha256:6b407c23f16ccc36614c136251d6b32bf30de7a57f8e782378f1107be008ddb0", - "sha256:6b4e7bff1d76dd2b46443078519dc75df1b5e01562345f0bb740cea5266d8218", - "sha256:6bdd9adfca615903578d2060ee8a6eb1c24eaf54919ff0ddc820118e5718931b", - "sha256:6eb9295c730e26b849ed1f4022735f36863eb46b14b6e10604c1c39b8b5efaea", - "sha256:703456146dc2bf430f7832fd1341adac5c893ec3c1430194fdcefba00012555c", - 
"sha256:754a0d09997095ad764ccef050dd5bf26cbf457aab9effcba5890dad081d879e", - "sha256:7af7785f5edd1f418da09a8cdb9ec84b0213e23d597413e06525340bcce1ea4f", - "sha256:7b29e97273bd6999e2bfe9fe3531b1f4f64effd28327bced048a33e49b99674a", - "sha256:7b4f57f7a0c65821c5441d98c47ff7c01d359b1e12328219709bdd97fdd37f90", - "sha256:8837d5a52a2d7aa9f4c3220a8484013aed1d8ad75240d9a75ede63709ef89055", - "sha256:8ccadf260bb46a61b9c7e89e2218f6efea9f3eeaaab4e3d1f58571890e54858e", - "sha256:8d8cf653e798ee4c4e654062b633db36984a1572f68c3aa25e364a0ddfbbb910", - "sha256:93b2a1f325fefa1482516ced160c8c7b4b8d53226763fa6c93d151fa25164207", - "sha256:9f0e0927efe87cd42ad600628e595a1a0aa1c64f6d0b55f7e6059079a428641a", - "sha256:a59a69deeb458c858b8fea6acf9e2acd5d755d76cd81a655256bc65c20dfff5b", - "sha256:a9f9b661f82693eb56beb0605156c7fca57f535704ab91837405913417d6990b", - "sha256:abfc57cab3c53c4546aee31859ef06753bfc136c9d208129bad23e2eca39155a", - "sha256:aca73d70c3f553552ba9133d4a09e767dcfeee352d8d8d3eb3f77e38a3beb3ed", - "sha256:adeaa886d607fb02563c1f625cf2ee58778a2567c0c109378da8f17ec3076ad7", - "sha256:b278a9248a4e3260fee3db7613772ca9ab6763a129d6d6f29555e2f9b168216d", - "sha256:b7c1239b64b70be7759554ad1a86288220bbb04d68518b527783c4ad3fb4f80b", - "sha256:bf8c7735fbfc0754111f00edda35cf9e98a8d478de6c47b04eaa9cef4300eaa7", - "sha256:c634a0a6db395fdaba0361aa78395597ee72c3aad651b9a307a3a7eaf5efd67e", - "sha256:cad9971881e4fec00d96af7eaf4b63aa7a595696fc221808b0d3ce7ca9743258", - "sha256:cbdb3f337c88b43c3b49ca377731912c101178be91cb5071aac48faa898e6f8e", - "sha256:cd8551aa21df6c60baa2624fd086ae7486bdde00c44097b32e1d1b1966e365e0", - "sha256:d09f677693328503c9e492e33e9601464297c01f9ebd966ea8fc5308f3069bfd", - "sha256:d376a35c6561e81d2590506804b428fc1075fcc6298fc5bb49b771534c0ba010", - "sha256:d39079379a9a28e74f4d57dc6357fa310a1977b51ff12239d7271ec7e71d67f5", - "sha256:d86f94743a11873317094326456b23f8a5788bad9161fd2f0e52088c33564620", - "sha256:d91e60ac44bbe3a77a67af4a4c13114cbe9f6d540337ce22f2c9eaf7454ca71f", - "sha256:d9883b2d819ce83f87ba82a746c81d14ada78784db431e57cc9719179847376e", - "sha256:e094e445c37c57e9ec612847812c301840239d34ccc5d153a982fa9814478c60", - "sha256:e19acfde38cb532a560b98f473adc741c941b7a9bc90f7294bc273d08becb58b", - "sha256:e32d43610dff472eab939f4d7fbdd240d1667794192690433672ae22d7af8445", - "sha256:ed028fc3d41adda916320712838aec289956c89b4f0a361ceadf83a53b4c047a", - "sha256:ef59c938f72bdbc6ab52dc50f81d0637fde0f194b02d636987cea2ab30f8f55a", - "sha256:f3d4801db8354436fd3936531e7f0e4feb411f62433a6b6cb32bb416e20b529f", - "sha256:f57aca20e637750a2c18d979f7096e2c2033cc40cf7ed201494318de1182f135", - "sha256:f9da128d0edf990cf0d2ca011b02cd6f639e79286774bd5b0351245cbb5a6e51", - "sha256:fbd7351d43b80d9c64c3cfcb50008f786cc82cba0450e8599fdd64f264320bd3", - "sha256:fcb72249ac4ea81a7baefcbff74df7029c3cb1cf01a711113fa052d563639c9c", - "sha256:ff21c554304e8226bf80c3a7754be27c6c3549a9fec563a03c06ee8f494da8fc" + "sha256:022cc673e69283a42621dd453e2407cf1647e77f8bd857d7ad7499901e62376f", + "sha256:02a69369862099e37d00765583052a99d6a68af7e19b887e1b78fee0146b755a", + "sha256:037f5cb6fe5abe23f1dc058054d50e9699fcc90d0677eee4e4f74a8677636a1a", + "sha256:064a286e6ab0b4c900e228ab4fa9cb3811b4b83d3e0cc5cd816b2d0f548cb61c", + "sha256:078ae52ffb3f036396cc4aed558e5b61faedd504a3c1f62b8ae34bf95ae39d94", + "sha256:07c4d7c9305e75a0edd3427b79c7bd1d019cd7eddaa7c89dbb10e0c7946bffbb", + "sha256:0e8f864b521f6cfedb314d171630f827efee08f5c3462bcbc2244ab8e1768cd6", + "sha256:0f8cac84196d0ffcadf8469d9ded4d4e3a8b1c666095c2a291e22bf58e1e8a9f", + 
"sha256:0fd766bb9ace3498f6b93d32f30c0e7c8ce6b727fecbc84d28160e217bb66254", + "sha256:114722f35093da080a333b3834fff04ef43147577ed99dd4db574b03a5f7d170", + "sha256:1437c3f72a30c7047f16fd3e972ea58b90172c3c6ca309645c1c68984f05526a", + "sha256:188b4b1a770f7f95ea035d5bbb9d7367248fc9d12321deef78a269ebf46a5729", + "sha256:1b668b1c840183e4e38ed5a99f62fac44c3a3eef16870f7f17cfdfb8b47550ed", + "sha256:1c4c89fb01157dd0a3bfe9e75cd6253b0a1678922befcd664eca0772a4c6c979", + "sha256:1ef704e01cb6ad39ad7af668d51677557ca7e5d377663286f0ee1b6b27c28e5f", + "sha256:21ea710e96c1e050635700695095962a22ea420d4b3755a25e4909f2172b4ff2", + "sha256:25cc40d8eb63f0a7ea4c8f49f524989b9df901969cb860a2bc0e4bad4b8cb8a8", + "sha256:2857c875f1edd1feef3c371fbf830a61b632fb4d1e57160bb1e6a3206e6abe67", + "sha256:28f990e6821204f516d09dc39966ef8b84556ffd648d5926c9a3f681e8de8906", + "sha256:2b3ca211ae8ea540569e9c513da052699b7b06928dcda61247cb4f318122bdb5", + "sha256:2e734c2c54423c6dcc77f58a8585ba83b9f72e422f9edf09cab1096d4a4bdc82", + "sha256:3485b9bb7dfa66167d5500ffdafdc35415b45f0da06c75eb7df131f3357b174a", + "sha256:3749ef74c170809e6dee68addec9d2458700a8de703de081c888e92a8b015cf9", + "sha256:3871af56c59864d5fd21d1ac001eb2fb3b140d52ba0454720f2e4a19812404ba", + "sha256:39003fc73f925e684f8521b2dbf34f61a5deb8a20a15dcf53e0d823190ce8848", + "sha256:3ca1caedf8331d8ad6027f93b52d68ed8f8009f5c420c246a46fe9d3be06be0f", + "sha256:419eea245e7ec0fe664eb7e85e7ff97dcdb2513ca4f6b45a8ec4a3346904f95a", + "sha256:42da201c47c77b6cc91fc17e0e2b330154428d35d6024f3278aa2683e7e2daf2", + "sha256:43a2515a33f2bc17b15f7fb49ff6426e49cb1d5b2539bc7f8126b9c5c7f37164", + "sha256:4450c354b89dbb266730893862dbff06006c9ed5b06b6016d529b2bf644fc681", + "sha256:4df7c9def4fc619a9c2ab402d73a0c5b53899abe090e0100323b13ccb5a3dd82", + "sha256:4f1ee004942eaaed6e06c087d93ebc1c67e9a293e5f6b9b5da558df6bf23dc5d", + "sha256:52e34c6af84e12921748c8354aa6acf1912ca98ba60cdaa6920e34793f1a0788", + "sha256:543c42fa242faae0466fe72d297976f3c710a357a219b1efde3a0539a68a6997", + "sha256:5a72b905420c4bb2c10c87b5c09fe6faf4a76d64730e3802feef255e43dfbf5a", + "sha256:618b7459bb392bdf373f2327e477597fff8f9e6a1878fffc1b711c013d1b0da4", + "sha256:6bb15ee29d95875ad697d449fe6071b67f730f15a6961913a2b0205015ca0843", + "sha256:6fc4aa67fedd827a601f97f0e61cc72711d0a9165f2c518e9a7c38fc1568b9ad", + "sha256:70969229cb23d9c1a80e14225838d56e464dc71fa34c8342c954fc50e7516dee", + "sha256:71a56f4671f7ff723451f26a6131754d7c1809e04e22ebfbac1db8c9e6767a20", + "sha256:721a7b125a817d60bf4924e1eec2a7867bfcf64cfc333045de1df7a0629e4481", + "sha256:76b2ba71265c0102d11458879b4d53ccd0b32b0164d14deb8d2b598a018e502f", + "sha256:772e18696cf5a64afee908662fbcb1f907460ddc851336ee3a848ef7684c8e1e", + "sha256:7766b57aeebaf3f1dac14fdd4a75c9a61f2ed56d8ebeefe4189db1cb9d2a3783", + "sha256:776dbb9bfa0fc5ce64234b446995d8d9f04badf64f544ca036bd6cff6f0732ce", + "sha256:77772a4b8b5f77d47d883846928c36d730b6e612a6388c74cba33ad9eb149c11", + "sha256:7dd3b5c37e0fb6666c27cf4e2c88ae43da904f2155c4cfc1e5a2fdce3b9fcf92", + "sha256:7e4b5ffa1614ad4f32237d739699be444be28de95071bfa4e66a8da9fa777798", + "sha256:8a461f6456981d8c8e971ff5a55f2e34f4e60871e665d2f5fde23ee74dea4eeb", + "sha256:95cb80854a355b284c55f79674f6187cc9574df4dc362524e0cce98c89ee8331", + "sha256:a34ae11315d4e26326aaf04e21ccd8d9b7de983635fba38d73e203a9c8e3fe3d", + "sha256:a4f7339d9e445280f23d63dea842c0c77379c4a47471c538fc8feedab9d8d063", + "sha256:a5deebb53d7a4d7e2e758a96befcd8edaaca0633ae71857995a0f16033289e44", + "sha256:a9c5de1928c486201b23ed0cc4ac92e6e07be5cd7f3abc57c88a9cf4f0f32108", + 
"sha256:adefe0d48ad35b90b6f361f6ff5a1bd95af80c17d18619c093c60a20e7a5b60c", + "sha256:b35c63f557653c05b5b1b6559a074dbabe0afee28ee2a05b6c9ba21ad0d16a74", + "sha256:b370a77be0a16e1ad0270822c12c21462dc40496e891d3b0caf1617c8cc57e20", + "sha256:b4c25312c7f4e6ab35ab16211bdf819e6e4eddcba3b2ea632fb51c9a2a97e105", + "sha256:b719c8730c02a606dc0e8413287e8e94ac2d32a51153b300baf1f62347858fba", + "sha256:bc4aebecc79781a1b77d7d4e7d9fe080385a439e198d993b557b60f9117addaf", + "sha256:c2a6f1236151e6fe1da289351b5b5bce49651c91554ecc7b70a947bced6fe212", + "sha256:c66c2b245926ec15188aead25d395091cb5c9df008d3b3207268cd65557d6286", + "sha256:c96cb76f055b33308f6858b9b594618f1b46e147a4d03a4d7f0c449e304b9b95", + "sha256:c9cab4b3de1f55e6c30a84c8cee20e4d3b2476f4d547256694a1b0163da4fe32", + "sha256:ce1b44091355b68cffd16e2abac07c1cafa953fa935852d3a4dd8975044ca3bf", + "sha256:ce58420e25097b2fc201aef9b9f6d65df1eb8438e51154e1a7feb8847e4a55ab", + "sha256:d05acd46b9a52087bfc50c59dfdf96a2c480a601e8898a44821c7fd676598f74", + "sha256:d31acb5886c16ae1711741f22504195af46edec8315fe69b77e477682a87a83e", + "sha256:d44a1b1ba44cbd2fc3cb77992bef6d6fdb1028849824e1dd5e4d746e1f7f7f0b", + "sha256:d854c6dc0f689bad7ed452d2a3ecff58029d80612d336a45b62c35e917f42d23", + "sha256:dc300cb5a5a01947b1ee8099233156fdccd5001739e5f596ecfbc0dab07b5a3b", + "sha256:e710c983d29d9cc4da29113b323647db286eaf384746344f4a233708cca1a82c", + "sha256:ec72342cc4d62f38b25a94e28b9efefce41839aecdecf5e9627473ed04b7be16", + "sha256:ee8d3323d921e0f6919918a97f9b5445a7dfe647270b2629ec1008aa676c0bc0", + "sha256:f79bc3595b6ed159a1bf0cdc70ed6ebec393a874565cab7088a219cca14da727", + "sha256:f7fa8beef580091c02b4fd26542de046b2abfe0aaefa02e8bcf68acb7618f2b3" ], "markers": "python_version >= '3.9'", - "version": "==0.7.3" + "version": "==0.7.4" }, "license-expression": { "hashes": [ @@ -1340,48 +1339,48 @@ }, "mypy": { "hashes": [ - "sha256:0c01c99d626380752e527d5ce8e69ffbba2046eb8a060db0329690849cf9b6f9", - "sha256:0dde5cb375cb94deff0d4b548b993bec52859d1651e073d63a1386d392a95495", - "sha256:0e3c3d1e1d62e678c339e7ade72746a9e0325de42cd2cccc51616c7b2ed1a018", - "sha256:0ea4fd21bb48f0da49e6d3b37ef6bd7e8228b9fe41bbf4d80d9364d11adbd43c", - "sha256:0fb3115cb8fa7c5f887c8a8d81ccdcb94cff334684980d847e5a62e926910e1d", - "sha256:11f7254c15ab3f8ed68f8e8f5cbe88757848df793e31c36aaa4d4f9783fd08ab", - "sha256:120cffe120cca5c23c03c77f84abc0c14c5d2e03736f6c312480020082f1994b", - "sha256:16f76ff3f3fd8137aadf593cb4607d82634fca675e8211ad75c43d86033ee6c6", - "sha256:1cf9c59398db1c68a134b0b5354a09a1e124523f00bacd68e553b8bd16ff3299", - "sha256:318ba74f75899b0e78b847d8c50821e4c9637c79d9a59680fc1259f29338cb3e", - "sha256:3210d87b30e6af9c8faed61be2642fcbe60ef77cec64fa1ef810a630a4cf671c", - "sha256:34ec1ac66d31644f194b7c163d7f8b8434f1b49719d403a5d26c87fff7e913f7", - "sha256:37af5166f9475872034b56c5efdcf65ee25394e9e1d172907b84577120714364", - "sha256:3ad925b14a0bb99821ff6f734553294aa6a3440a8cb082fe1f5b84dfb662afb1", - "sha256:510c014b722308c9bd377993bcbf9a07d7e0692e5fa8fc70e639c1eb19fc6bee", - "sha256:6016c52ab209919b46169651b362068f632efcd5eb8ef9d1735f6f86da7853b2", - "sha256:6148ede033982a8c5ca1143de34c71836a09f105068aaa8b7d5edab2b053e6c8", - "sha256:63ea6a00e4bd6822adbfc75b02ab3653a17c02c4347f5bb0cf1d5b9df3a05835", - "sha256:7686ed65dbabd24d20066f3115018d2dce030d8fa9db01aa9f0a59b6813e9f9e", - "sha256:7a500ab5c444268a70565e374fc803972bfd1f09545b13418a5174e29883dab7", - "sha256:8f44f2ae3c58421ee05fe609160343c25f70e3967f6e32792b5a78006a9d850f", - 
"sha256:a18d8abdda14035c5718acb748faec09571432811af129bf0d9e7b2d6699bf18", - "sha256:a31e4c28e8ddb042c84c5e977e28a21195d086aaffaf08b016b78e19c9ef8106", - "sha256:a9ac09e52bb0f7fb912f5d2a783345c72441a08ef56ce3e17c1752af36340a39", - "sha256:b9d491295825182fba01b6ffe2c6fe4e5a49dbf4e2bb4d1217b6ced3b4797bc6", - "sha256:c14a98bc63fd867530e8ec82f217dae29d0550c86e70debc9667fff1ec83284e", - "sha256:c3385246593ac2b97f155a0e9639be906e73534630f663747c71908dfbf26134", - "sha256:cabbee74f29aa9cd3b444ec2f1e4fa5a9d0d746ce7567a6a609e224429781f53", - "sha256:cb64b0ba5980466a0f3f9990d1c582bcab8db12e29815ecb57f1408d99b4bff7", - "sha256:cf7d84f497f78b682edd407f14a7b6e1a2212b433eedb054e2081380b7395aa3", - "sha256:e2c1101ab41d01303103ab6ef82cbbfedb81c1a060c868fa7cc013d573d37ab5", - "sha256:f188dcf16483b3e59f9278c4ed939ec0254aa8a60e8fc100648d9ab5ee95a431", - "sha256:f2e36bed3c6d9b5f35d28b63ca4b727cb0228e480826ffc8953d1892ddc8999d", - "sha256:f3e19e3b897562276bb331074d64c076dbdd3e79213f36eed4e592272dabd760", - "sha256:f6b874ca77f733222641e5c46e4711648c4037ea13646fd0cdc814c2eaec2528", - "sha256:f75e60aca3723a23511948539b0d7ed514dda194bc3755eae0bfc7a6b4887aa7", - "sha256:fc51a5b864f73a3a182584b1ac75c404396a17eced54341629d8bdcb644a5bba", - "sha256:fd4a985b2e32f23bead72e2fb4bbe5d6aceee176be471243bd831d5b2644672d" + "sha256:016f2246209095e8eda7538944daa1d60e1e8134d98983b9fc1e92c1fc0cb8dd", + "sha256:022ea7279374af1a5d78dfcab853fe6a536eebfda4b59deab53cd21f6cd9f00b", + "sha256:06e6170bd5836770e8104c8fdd58e5e725cfeb309f0a6c681a811f557e97eac1", + "sha256:19d88bb05303fe63f71dd2c6270daca27cb9401c4ca8255fe50d1d920e0eb9ba", + "sha256:21761006a7f497cb0d4de3d8ef4ca70532256688b0523eee02baf9eec895e27b", + "sha256:28902ee51f12e0f19e1e16fbe2f8f06b6637f482c459dd393efddd0ec7f82045", + "sha256:2899753e2f61e571b3971747e302d5f420c3fd09650e1951e99f823bc3089dac", + "sha256:2abb24cf3f17864770d18d673c85235ba52456b36a06b6afc1e07c1fdcd3d0e6", + "sha256:34c81968774648ab5ac09c29a375fdede03ba253f8f8287847bd480782f73a6a", + "sha256:409088884802d511ee52ca067707b90c883426bd95514e8cfda8281dc2effe24", + "sha256:481daf36a4c443332e2ae9c137dfee878fcea781a2e3f895d54bd3002a900957", + "sha256:4b84a7a18f41e167f7995200a1d07a4a6810e89d29859df936f1c3923d263042", + "sha256:4f28f99c824ecebcdaa2e55d82953e38ff60ee5ec938476796636b86afa3956e", + "sha256:5f05aa3d375b385734388e844bc01733bd33c644ab48e9684faa54e5389775ec", + "sha256:7bcfc336a03a1aaa26dfce9fff3e287a3ba99872a157561cbfcebe67c13308e3", + "sha256:804bd67b8054a85447c8954215a906d6eff9cabeabe493fb6334b24f4bfff718", + "sha256:8bb5c6f6d043655e055be9b542aa5f3bdd30e4f3589163e85f93f3640060509f", + "sha256:a009ffa5a621762d0c926a078c2d639104becab69e79538a494bcccb62cc0331", + "sha256:a8174a03289288c1f6c46d55cef02379b478bfbc8e358e02047487cad44c6ca1", + "sha256:ab43590f9cd5108f41aacf9fca31841142c786827a74ab7cc8a2eacb634e09a1", + "sha256:b10e7c2cd7870ba4ad9b2d8a6102eb5ffc1f16ca35e3de6bfa390c1113029d13", + "sha256:b13cfdd6c87fc3efb69ea4ec18ef79c74c3f98b4e5498ca9b85ab3b2c2329a67", + "sha256:b64d987153888790bcdb03a6473d321820597ab8dd9243b27a92153c4fa50fd2", + "sha256:b7951a701c07ea584c4fe327834b92a30825514c868b1f69c30445093fdd9d5a", + "sha256:bdb12f69bcc02700c2b47e070238f42cb87f18c0bc1fc4cdb4fb2bc5fd7a3b8b", + "sha256:c35d298c2c4bba75feb2195655dfea8124d855dfd7343bf8b8c055421eaf0cf8", + "sha256:c608937067d2fc5a4dd1a5ce92fd9e1398691b8c5d012d66e1ddd430e9244376", + "sha256:c9a6538e0415310aad77cb94004ca6482330fece18036b5f360b62c45814c4ef", + "sha256:d8dfc6ab58ca7dda47d9237349157500468e404b17213d44fc1cb77bce532288", + 
"sha256:da4869fc5e7f62a88f3fe0b5c919d1d9f7ea3cef92d3689de2823fd27e40aa75", + "sha256:de759aafbae8763283b2ee5869c7255391fbc4de3ff171f8f030b5ec48381b74", + "sha256:e3157c7594ff2ef1634ee058aafc56a82db665c9438fd41b390f3bde1ab12250", + "sha256:e3f276d8493c3c97930e354b2595a44a21348b320d859fb4a2b9f66da9ed27ab", + "sha256:ee4c11e460685c3e0c64a4c5de82ae143622410950d6be863303a1c4ba0e36d6", + "sha256:f1235f5ea01b7db5468d53ece6aaddf1ad0b88d9e7462b86ef96fe04995d7247", + "sha256:f7cee03c9a2e2ee26ec07479f38ea9c884e301d42c6d43a19d20fb014e3ba925", + "sha256:f859fb09d9583a985be9a493d5cfc5515b56b08f7447759a0c5deaf68d80506e", + "sha256:ffcebe56eb09ff0c0885e750036a095e23793ba6c2e894e7e63f6d89ad51f22e" ], "index": "pypi", "markers": "python_version >= '3.9'", - "version": "==1.19.0" + "version": "==1.19.1" }, "mypy-boto3-cloudformation": { "hashes": [ @@ -1401,19 +1400,19 @@ }, "mypy-boto3-ec2": { "hashes": [ - "sha256:1b54eaa6403c10677496f7dccdd0c2911533b1f26e4a5732af56fbd31141796f", - "sha256:bc89dcbd7057bd58fd4dc9956ae4d581c35c908c4920318d954ec41fe507ee37" + "sha256:a7e392906fac6a3aeb78edd990f9781de435f437be376bebe9e137a44143f73f", + "sha256:d86406842e8fdeb3dda2097372bc51ecddebf3903e206b9ffb7472ee2660d2b8" ], "markers": "python_version >= '3.9'", - "version": "==1.42.5" + "version": "==1.42.13" }, "mypy-boto3-lambda": { "hashes": [ - "sha256:12e810b4c7d37be3c4d83e6c6bf638da4f56ad191495ddbeb589929f2270f9f0", - "sha256:c1ecd68b1c2fa89ff18d32cbf3b12cdbb078b8a5bbbc95da80f9a61bfcd1229d" + "sha256:55deadbfaf0e5f118237831a84d35f48dc7164ce2bf7efdcb54f54aef4025602", + "sha256:fbb6646138520c675a4c4adff334e830b010d5c077dee8d5187346809ebb6f72" ], "markers": "python_version >= '3.9'", - "version": "==1.42.3" + "version": "==1.42.8" }, "mypy-boto3-rds": { "hashes": [ @@ -1425,11 +1424,11 @@ }, "mypy-boto3-s3": { "hashes": [ - "sha256:2507bdfa17829f1f422b8bf334db836689b0529fbe6635af4e0f3aaa72f92806", - "sha256:97c5171928a2ae8c7b60a60700f395407cb5eca60704c7ab6a1ff0861f4db997" + "sha256:9a4575124b500c29c023919f17b022e66109a56ba2318ef8aeab3d0dd2cd174e", + "sha256:e5f6fb51f215b30255ee076712032c6810b274a20062d5fa2ecd7816ac1a1274" ], "markers": "python_version >= '3.9'", - "version": "==1.42.3" + "version": "==1.42.10" }, "mypy-boto3-sqs": { "hashes": [ @@ -1635,12 +1634,12 @@ }, "pre-commit": { "hashes": [ - "sha256:25e2ce09595174d9c97860a95609f9f852c0614ba602de3561e267547f2335e1", - "sha256:dc5a065e932b19fc1d4c653c6939068fe54325af8e741e74e88db4d28a4dd66b" + "sha256:3b3afd891e97337708c1674210f8eba659b52a38ea5f822ff142d10786221f77", + "sha256:eb545fcff725875197837263e977ea257a402056661f09dae08e4b149b030a61" ], "index": "pypi", "markers": "python_version >= '3.10'", - "version": "==4.5.0" + "version": "==4.5.1" }, "prompt-toolkit": { "hashes": [ @@ -1725,18 +1724,17 @@ "sha256:f7fe3dbe871294ba70d789be16b6e7e52b418311e166e0e3cba9522f0f437fb1", "sha256:f963ba8c3b0199f9d6b794c90ec77545e05eadc83973897a4523c9e8d84e9340" ], - "index": "pypi", "markers": "python_version >= '3.10'", "version": "==22.0.0" }, "pyarrow-stubs": { "hashes": [ - "sha256:a53793149bcbf41670acb8cd843645e3bf0469e4aa035824adda61c48fa900c6", - "sha256:eab02b02d4d74d86619b7f7b9fe6e7ddfe425cedf8c31aa4c7fd33cdb3b189be" + "sha256:0634e70388cd23e7c78e2abbb1989822edc34df2d2ff4fd50a2316dd0cdafd9f", + "sha256:92c1fda4998f0c13e608d8abc7e4b8537e3ef108f6bf42c58e5af97e7d143e75" ], "index": "pypi", "markers": "python_version >= '3.9' and python_version < '4'", - "version": "==20.0.0.20251209" + "version": "==20.0.0.20251215" }, "pycparser": { "hashes": [ @@ -1901,29 +1899,29 @@ 
}, "ruff": { "hashes": [ - "sha256:15f04cb45c051159baebb0f0037f404f1dc2f15a927418f29730f411a79bc4e7", - "sha256:1af35c2d62633d4da0521178e8a2641c636d2a7153da0bac1b30cfd4ccd91344", - "sha256:1d62cb310c4fbcb9ee4ac023fe17f984ae1e12b8a4a02e3d21489f9a2a5f730c", - "sha256:21d48fa744c9d1cb8d71eb0a740c4dd02751a5de9db9a730a8ef75ca34cf138e", - "sha256:25add4575ffecc53d60eed3f24b1e934493631b48ebbc6ebaf9d8517924aca4b", - "sha256:2c87e09b3cd9d126fc67a9ecd3b5b1d3ded2b9c7fce3f16e315346b9d05cfb52", - "sha256:2e2fcbefe91f9fad0916850edf0854530c15bd1926b6b779de47e9ab619ea38f", - "sha256:4c943d847b7f02f7db4201a0600ea7d244d8a404fbb639b439e987edcf2baf9a", - "sha256:774ed0dd87d6ce925e3b8496feb3a00ac564bea52b9feb551ecd17e0a23d1eed", - "sha256:7aaf2974f378e6b01d1e257c6948207aec6a9b5ba53fab23d0182efb887a0e4a", - "sha256:8cdb162a7159f4ca36ce980a18c43d8f036966e7f73f866ac8f493b75e0c27e9", - "sha256:965a582c93c63fe715fd3e3f8aa37c4b776777203d8e1d8aa3cc0c14424a4b99", - "sha256:9eeb0b24242b5bbff3011409a739929f497f3fb5fe3b5698aba5e77e8c833097", - "sha256:a9d70721066a296f45786ec31916dc287b44040f553da21564de0ab4d45a869b", - "sha256:cb6e8bf7b4f627548daa1b69283dac5a296bfe9ce856703b03130732e20ddfe2", - "sha256:e5758ca513c43ad8a4ef13f0f081f80f08008f410790f3611a21a92421ab045b", - "sha256:ec071e9c82eca417f6111fd39f7043acb53cd3fde9b1f95bbed745962e345afb", - "sha256:eed28f6fafcc9591994c42254f5a5c5ca40e69a30721d2ab18bb0bb3baac3ab6", - "sha256:f74f7ba163b6e85a8d81a590363bf71618847e5078d90827749bfda1d88c9cdf" + "sha256:104c49fc7ab73f3f3a758039adea978869a918f31b73280db175b43a2d9b51d6", + "sha256:1484983559f026788e3a5c07c81ef7d1e97c1c78ed03041a18f75df104c45405", + "sha256:16a01dfb7b9e4eee556fbfd5392806b1b8550c9b4a9f6acd3dbe6812b193c70a", + "sha256:213db2b2e44be8625002dbea33bb9c60c66ea2c07c084a00d55732689d697a7f", + "sha256:466297bd73638c6bdf06485683e812db1c00c7ac96d4ddd0294a338c62fdc154", + "sha256:4bb98fcbbc61725968893682fd4df8966a34611239c9fd07a1f6a07e7103d08e", + "sha256:59aabd2e2c4fd614d2862e7939c34a532c04f1084476d6833dddef4afab87e9f", + "sha256:5bcf45b681e9f1ee6445d317ce1fa9d6cba9a6049542d1c3d5b5958986be8830", + "sha256:674f9be9372907f7257c51f1d4fc902cb7cf014b9980152b802794317941f08f", + "sha256:6987ebe0501ae4f4308d7d24e2d0fe3d7a98430f5adfd0f1fead050a740a3a77", + "sha256:7165d31a925b7a294465fa81be8c12a0e9b60fb02bf177e79067c867e71f8b1f", + "sha256:7a3ce585f2ade3e1f29ec1b92df13e3da262178df8c8bdf876f48fa0e8316c49", + "sha256:9a2e830f075d1a42cd28420d7809ace390832a490ed0966fe373ba288e77aaf4", + "sha256:b914c40ab64865a17a9a5b67911d14df72346a634527240039eb3bd650e5979d", + "sha256:c561695675b972effb0c0a45db233f2c816ff3da8dcfbe7dfc7eed625f218935", + "sha256:c70427132db492d25f982fffc8d6c7535cc2fd2c83fc8888f05caaa248521e60", + "sha256:d85713d522348837ef9df8efca33ccb8bd6fcfc86a2cde3ccb4bc9d28a18003d", + "sha256:e51d046cf6dda98a4633b8a8a771451107413b0f07183b2bef03f075599e44e6", + "sha256:f24b47993a9d8cb858429e97bdf8544c78029f09b520af615c1d261bf827001d" ], "index": "pypi", "markers": "python_version >= '3.7'", - "version": "==0.14.8" + "version": "==0.14.10" }, "s3transfer": { "hashes": [ @@ -2030,11 +2028,11 @@ }, "types-awscrt": { "hashes": [ - "sha256:3f5d1e6c99b0b551af6365f9c04d8ce2effbcfe18bb719a34501efea279ae7bb", - "sha256:41e01e14d646877bd310e7e3c49ff193f8361480b9568e97b1639775009bbefa" + "sha256:362fd8f5eaebcfcd922cb9fd8274fb375df550319f78031ee3779eac0b9ecc79", + "sha256:8204126e01a00eaa4a746e7a0076538ca0e4e3f52408adec0ab9b471bb0bb64b" ], "markers": "python_version >= '3.8'", - "version": "==0.29.2" + "version": "==0.30.0" }, 
"types-pytz": { "hashes": [ From c59f2d192c3eaf1426b3562d01c3abe6367f1b58 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Fri, 19 Dec 2025 09:12:44 -0500 Subject: [PATCH 2/3] Refactor class and duckdb connection relationships Why these changes are being introduced: This refactoring work was a long time coming, inspired by a recent need to gracefully handle a read request for embeddings against a dataset without embeddings parquet files. If we can normalize how and when tables are created, and the handling of duckdb connections, we can normalize handling read requests for data that may not be available (yet). As such, this refactoring work will help normalize read edge cases now and going forward. This library was built in stages. First was TIMDEXDataset, which read parquet files directly. Then TIMDEXDatasetMetadata, which more formally introduced DuckDB. It handled the connection creation and configuration. This connection was shared with TIMDEXDataset as we leaned into DuckDB reading. Lastly, TIMDEXEmbeddings was added as our first new "source" of data. This class shared the connection from TIMDEXDataset. Both TIMDEXDatasetMetadata and TIMDEXEmbeddings were doing their own SQLAlchemy table reflections. TIMDEXDatasetMetadata could be instantiated on its own, while TIMDEXEmbeddings was assumed to take an instance of TIMDEXDataset. At this point, while things worked, it was clear that a refactor would be beneficial. We needed clearer responsibility of what created and configured the DuckDB connection, solidify that TIMDEXDatasetMetadata and TIMDEXEmbeddings are components on TIMDEXDataset, and how and when SQLAlchemy reflection was performed. Aligning all these things will make responding to these read and write edge cases much easier. How this addresses that need: - A new factory class is created DuckDBConnectionFactory that is responsible for creating and configuring any DuckDB connections used. - Both TIMDEXDatasetMetadata and TIMDEXEmbeddings require a TIMDEXDataset instance, and then themselves become components on TIMDEXDataset. We can more accurately call them "components" then of the primary TIMDEXDataset. - TIMDEXDataset handles the creation of a DuckDB connection via the new factory, and this connection is then accesible to its components TIMDEXDatasetMetadata and TIMDEXEmbeddings (maybe more in the future) - TIMDEXDataset is also responsible for all SQLAlchemy reflection, saving to self.sa_tables. In this way, any component that may want a SQLAlchemy instance, e.g. 
 @pytest.fixture(scope="module")
 def timdex_metadata(timdex_dataset_with_runs) -> TIMDEXDatasetMetadata:
     """TIMDEXDatasetMetadata with static database file created."""
-    metadata = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)
-    metadata.rebuild_dataset_metadata()
-    metadata.refresh()
-    return metadata
+    timdex_dataset_with_runs.metadata.rebuild_dataset_metadata()
+    return timdex_dataset_with_runs.metadata


 @pytest.fixture(scope="module")
@@ -247,9 +246,9 @@ def timdex_dataset_with_runs_with_metadata(


 @pytest.fixture
-def timdex_metadata_empty(timdex_dataset_with_runs) -> TIMDEXDatasetMetadata:
+def timdex_metadata_empty(timdex_dataset_empty) -> TIMDEXDatasetMetadata:
     """TIMDEXDatasetMetadata without static database file."""
-    return TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)
+    return timdex_dataset_empty.metadata


 @pytest.fixture
@@ -271,7 +270,8 @@ def timdex_metadata_with_deltas(
     )
     td.write(records)

-    return TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)
+    # return fresh TIMDEXDataset's metadata
+    return TIMDEXDataset(timdex_dataset_with_runs.location).metadata


 @pytest.fixture
@@ -286,12 +286,11 @@ def timdex_metadata_merged_deltas(
     # clone dataset with runs using new dataset location
     td = TIMDEXDataset(dataset_location, config=timdex_dataset_with_runs.config)

-    # clone metadata and merge append deltas
-    metadata = TIMDEXDatasetMetadata(td.location)
-    metadata.merge_append_deltas()
-    metadata.refresh()
+    # merge append deltas via the TD's metadata
+    td.metadata.merge_append_deltas()
+    td.refresh()

-    return metadata
+    return td.metadata


 # ================================================================================
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
index 09a22c4..fef7fc5 100644
--- a/tests/test_embeddings.py
+++ b/tests/test_embeddings.py
@@ -152,12 +152,13 @@ def test_embeddings_read_batches_yields_pyarrow_record_batches(
     timdex_dataset_empty.metadata.rebuild_dataset_metadata()
     timdex_dataset_empty.refresh()

-    # write embeddings
-    timdex_embeddings = TIMDEXEmbeddings(timdex_dataset_empty)
-    timdex_embeddings.write(sample_embeddings_generator(100, run_id="test-run"))
-    timdex_embeddings = TIMDEXEmbeddings(timdex_dataset_empty)
+    # write embeddings and refresh to pick up new views
+    timdex_dataset_empty.embeddings.write(
+        sample_embeddings_generator(100, run_id="test-run")
+    )
+    timdex_dataset_empty.refresh()

-    batches = timdex_embeddings.read_batches_iter()
+    batches = timdex_dataset_empty.embeddings.read_batches_iter()
     batch = next(batches)

     assert isinstance(batch, pa.RecordBatch)
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index 3674f63..af94193 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -6,7 +6,7 @@

 from duckdb import DuckDBPyConnection

-from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
+from timdex_dataset_api import TIMDEXDataset

 ORDERED_METADATA_COLUMN_NAMES = [
     "timdex_record_id",
@@ -21,29 +21,33 @@
 ]


-def test_tdm_init_no_metadata_file_warning_success(caplog, timdex_dataset_with_runs):
-    TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)
-
+def test_tdm_init_no_metadata_file_warning_success(caplog, tmp_path):
+    # creating a new TIMDEXDataset will log warning if no metadata file
+    caplog.set_level("WARNING")
+    TIMDEXDataset(str(tmp_path / "new_empty_dataset"))
     assert "Static metadata database not found" in caplog.text


 def
test_tdm_local_dataset_structure_properties(tmp_path): local_root = str(Path(tmp_path) / "path/to/nothing") - tdm_local = TIMDEXDatasetMetadata(local_root) - assert tdm_local.location == local_root - assert tdm_local.location_scheme == "file" + td_local = TIMDEXDataset(local_root) + assert td_local.metadata.location == local_root + assert td_local.metadata.location_scheme == "file" -def test_tdm_s3_dataset_structure_properties(s3_bucket_mocked): - s3_root = "s3://timdex/dataset" - tdm_s3 = TIMDEXDatasetMetadata(s3_root) - assert tdm_s3.location == s3_root - assert tdm_s3.location_scheme == "s3" +def test_tdm_s3_dataset_structure_properties(timdex_dataset_empty): + # test that location_scheme property works correctly for local paths + # S3 tests require full mocking and are covered in other tests + assert timdex_dataset_empty.metadata.location_scheme == "file" -def test_tdm_create_metadata_database_file_success(caplog, timdex_metadata_empty): +def test_tdm_create_metadata_database_file_success( + caplog, timdex_dataset_with_runs, timdex_metadata_empty +): caplog.set_level("DEBUG") - timdex_metadata_empty.rebuild_dataset_metadata() + # use a fresh dataset from timdex_dataset_with_runs location + td = TIMDEXDataset(timdex_dataset_with_runs.location) + td.metadata.rebuild_dataset_metadata() def test_tdm_init_metadata_file_found_success(timdex_metadata): @@ -321,15 +325,15 @@ def test_tdm_merge_append_deltas_deletes_append_deltas( assert not os.listdir(timdex_metadata_merged_deltas.append_deltas_path) -def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_and_valid( +def test_td_prepare_duckdb_secret_and_extensions_home_env_var_set_and_valid( monkeypatch, tmp_path_factory, timdex_dataset_with_runs ): preset_home = tmp_path_factory.mktemp("my-account") monkeypatch.setenv("HOME", str(preset_home)) - tdm = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) + td = TIMDEXDataset(timdex_dataset_with_runs.location) df = ( - tdm.conn.query( + td.conn.query( """ select current_setting('secret_directory') as secret_directory, @@ -344,15 +348,15 @@ def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_and_valid( assert df.extension_directory == "" # expected and okay when HOME set -def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_unset( +def test_td_prepare_duckdb_secret_and_extensions_home_env_var_unset( monkeypatch, timdex_dataset_with_runs ): monkeypatch.delenv("HOME", raising=False) - tdm = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) + td = TIMDEXDataset(timdex_dataset_with_runs.location) df = ( - tdm.conn.query( + td.conn.query( """ select current_setting('secret_directory') as secret_directory, @@ -367,15 +371,15 @@ def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_unset( assert df.extension_directory == "/tmp/.duckdb/extensions" -def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty( +def test_td_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty( monkeypatch, timdex_dataset_with_runs ): monkeypatch.setenv("HOME", "") # simulate AWS Lambda environment - tdm = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location) + td = TIMDEXDataset(timdex_dataset_with_runs.location) df = ( - tdm.conn.query( + td.conn.query( """ select current_setting('secret_directory') as secret_directory, @@ -390,14 +394,16 @@ def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty( assert df.extension_directory == "/tmp/.duckdb/extensions" -def 
test_tdm_preload_current_records_default_false(tmp_path): - tdm = TIMDEXDatasetMetadata(str(tmp_path)) - assert tdm.preload_current_records is False +def test_td_preload_current_records_default_false(tmp_path): + td = TIMDEXDataset(str(tmp_path)) + assert td.preload_current_records is False + assert td.metadata.preload_current_records is False -def test_tdm_preload_current_records_flag_true(tmp_path): - tdm = TIMDEXDatasetMetadata(str(tmp_path), preload_current_records=True) - assert tdm.preload_current_records is True +def test_td_preload_current_records_flag_true(tmp_path): + td = TIMDEXDataset(str(tmp_path), preload_current_records=True) + assert td.preload_current_records is True + assert td.metadata.preload_current_records is True def test_tdm_preload_false_no_temp_table(timdex_dataset_with_runs): diff --git a/tests/test_read.py b/tests/test_read.py index 89a5ce2..9fb8c0c 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -255,7 +255,6 @@ def test_dataset_load_current_records_gets_correct_same_day_full_run( ): # ensure metadata exists for this dataset timdex_dataset_same_day_runs.metadata.rebuild_dataset_metadata() - timdex_dataset_same_day_runs.metadata.refresh() df = timdex_dataset_same_day_runs.read_dataframe( table="current_records", run_type="full" ) @@ -266,7 +265,6 @@ def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering( timdex_dataset_same_day_runs, ): timdex_dataset_same_day_runs.metadata.rebuild_dataset_metadata() - timdex_dataset_same_day_runs.metadata.refresh() first_record = next( timdex_dataset_same_day_runs.read_dicts_iter( table="current_records", run_type="daily" diff --git a/timdex_dataset_api/dataset.py b/timdex_dataset_api/dataset.py index 61df087..71f17df 100644 --- a/timdex_dataset_api/dataset.py +++ b/timdex_dataset_api/dataset.py @@ -16,12 +16,15 @@ import pandas as pd import pyarrow as pa import pyarrow.dataset as ds -from duckdb import DuckDBPyConnection +from duckdb_engine import ConnectionWrapper from pyarrow import fs +from sqlalchemy import MetaData, Table, create_engine +from sqlalchemy.types import ARRAY, FLOAT from timdex_dataset_api.config import configure_logger from timdex_dataset_api.embeddings import TIMDEXEmbeddings from timdex_dataset_api.metadata import TIMDEXDatasetMetadata +from timdex_dataset_api.utils import DuckDBConnectionFactory if TYPE_CHECKING: from timdex_dataset_api.record import DatasetRecord # pragma: nocover @@ -78,6 +81,10 @@ class TIMDEXDatasetConfig: from a dataset; pyarrow default is 16 - fragment_read_ahead: number of fragments to optimistically read ahead when batch reaching from a dataset; pyarrow default is 4 + - duckdb_join_batch_size: batch size for keyset pagination when joining metadata + + Note: DuckDB connection settings (memory_limit, threads) are handled by + DuckDBConnectionFactory via TDA_DUCKDB_MEMORY_LIMIT and TDA_DUCKDB_THREADS env vars. 
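+
+    Illustrative usage, a sketch only (the values shown are assumptions,
+    not defaults):
+
+        config = TIMDEXDatasetConfig(read_batch_size=1_000)
+        td = TIMDEXDataset("s3://timdex/dataset", config=config)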
""" read_batch_size: int = field( @@ -132,18 +139,21 @@ def __init__( self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS self.dataset = self.load_pyarrow_dataset() - # dataset metadata - self.metadata = TIMDEXDatasetMetadata( - location, - preload_current_records=preload_current_records, - ) + # create DuckDB connection used by all classes + self.conn_factory = DuckDBConnectionFactory(location_scheme=self.location_scheme) + self.conn = self.conn_factory.create_connection() - # DuckDB context - self.conn = self.setup_duckdb_context() + # create schemas + self._create_duckdb_schemas() - # dataset embeddings + # composed components receive self + self.metadata = TIMDEXDatasetMetadata(self) self.embeddings = TIMDEXEmbeddings(self) + # SQLAlchemy (SA) reflection after components have set up their views + self.sa_tables: dict[str, dict[str, Table]] = {} + self.reflect_sa_tables() + @property def location_scheme(self) -> Literal["file", "s3"]: scheme = urlparse(self.location).scheme @@ -158,7 +168,7 @@ def data_records_root(self) -> str: return f"{self.location.removesuffix('/')}/data/records" # type: ignore[union-attr] def refresh(self) -> None: - """Fully reload TIMDEXDataset instance.""" + """Refresh dataset by fully reinitializing.""" self.__init__( # type: ignore[misc] self.location, config=self.config, @@ -245,24 +255,54 @@ def get_s3_filesystem() -> fs.FileSystem: session_token=credentials.token, ) - def setup_duckdb_context(self) -> DuckDBPyConnection: - """Create a DuckDB connection that metadata and data query and retrieval. + def _create_duckdb_schemas(self) -> None: + """Create DuckDB schemas used by all components.""" + self.conn.execute("create schema metadata;") + self.conn.execute("create schema data;") - This method extends TIMDEXDatasetMetadata's pre-existing DuckDB connection, adding - a 'data' schema and any other configurations needed. + def reflect_sa_tables(self, schemas: list[str] | None = None) -> None: + """Reflect SQLAlchemy metadata for DuckDB schemas. + + This centralizes SA reflection for all composed components. Reflected tables + are stored in self.sa_tables as {schema: {table_name: Table}}. 
+ + Args: + schemas: list of schemas to reflect; defaults to ["metadata", "data"] """ start_time = time.perf_counter() + schemas = schemas or ["metadata", "data"] - conn = self.metadata.conn + engine = create_engine( + "duckdb://", + creator=lambda: ConnectionWrapper(self.conn), + ) + + for schema in schemas: + db_metadata = MetaData() + db_metadata.reflect(bind=engine, schema=schema, views=True) + + # store tables in flat dict keyed by table name (without schema prefix) + self.sa_tables[schema] = { + table_name.removeprefix(f"{schema}."): table + for table_name, table in db_metadata.tables.items() + } - # create data schema - conn.execute("""create schema data;""") + # type fixup for embedding_vector column (DuckDB LIST -> SA ARRAY) + if "embeddings" in self.sa_tables.get("data", {}): + self.sa_tables["data"]["embeddings"].c.embedding_vector.type = ARRAY(FLOAT) logger.debug( - "DuckDB context created for TIMDEXDataset, " - f"{round(time.perf_counter()-start_time,2)}s" + f"SQLAlchemy reflection complete for schemas {schemas}, " + f"{round(time.perf_counter() - start_time, 3)}s" ) - return conn + + def get_sa_table(self, schema: str, table: str) -> Table: + """Get a reflected SQLAlchemy Table by schema and table name.""" + if schema not in self.sa_tables: + raise ValueError(f"Schema '{schema}' not found in reflected SA tables.") + if table not in self.sa_tables[schema]: + raise ValueError(f"Table '{table}' not found in schema '{schema}'.") + return self.sa_tables[schema][table] def write( self, @@ -326,7 +366,7 @@ def write( if write_append_deltas: for written_file in written_files: self.metadata.write_append_delta_duckdb(written_file.path) # type: ignore[attr-defined] - self.metadata.refresh() + self.refresh() self.log_write_statistics(start_time, written_files) @@ -575,9 +615,7 @@ def _iter_data_chunks(self, data_query: str) -> Iterator[pa.RecordBatch]: ) finally: if self.location_scheme == "s3": - self.conn.execute( - f"""set threads={self.metadata.config.duckdb_connection_threads};""" - ) + self.conn.execute(f"""set threads={self.conn_factory.threads};""") def read_dataframes_iter( self, diff --git a/timdex_dataset_api/embeddings.py b/timdex_dataset_api/embeddings.py index 00655ff..92a1465 100644 --- a/timdex_dataset_api/embeddings.py +++ b/timdex_dataset_api/embeddings.py @@ -14,11 +14,10 @@ from duckdb import DuckDBPyConnection from duckdb import IOException as DuckDBIOException from duckdb_engine import Dialect as DuckDBDialect -from sqlalchemy import Table, and_, select, text -from sqlalchemy.types import ARRAY, FLOAT +from sqlalchemy import and_, select, text from timdex_dataset_api.record import datetime_iso_parse -from timdex_dataset_api.utils import build_filter_expr_sa, sa_reflect_duckdb_conn +from timdex_dataset_api.utils import build_filter_expr_sa if TYPE_CHECKING: from timdex_dataset_api import TIMDEXDataset @@ -148,56 +147,35 @@ def __init__(self, timdex_dataset: "TIMDEXDataset"): - timdex_dataset: instance of TIMDEXDataset """ self.timdex_dataset = timdex_dataset + self.conn = timdex_dataset.conn self.schema = TIMDEX_DATASET_EMBEDDINGS_SCHEMA self.partition_columns = ["year", "month", "day"] - # DuckDB context - self.conn = self.setup_duckdb_context() - self._sa_metadata_data_schema = sa_reflect_duckdb_conn(self.conn, schema="data") - - # resolve data type for 'embedding_vector' column - if "data.embeddings" in self._sa_metadata_data_schema.tables: - sa_metadata_data_embeddings_table = self._sa_metadata_data_schema.tables[ - "data.embeddings" - ] - 
sa_metadata_data_embeddings_table.c.embedding_vector.type = ARRAY(FLOAT) + # set up embeddings views + self._setup_embeddings_views() @property def data_embeddings_root(self) -> str: return f"{self.timdex_dataset.location.removesuffix('/')}/data/embeddings" - def get_sa_table(self, table: str) -> Table: - """Get SQLAlchemy Table from reflected SQLAlchemy metadata.""" - schema_table = f"data.{table}" - if schema_table not in self._sa_metadata_data_schema.tables: - raise ValueError(f"Could not find table '{table}' in DuckDB schema 'data'.") - return self._sa_metadata_data_schema.tables[schema_table] - - def setup_duckdb_context(self) -> DuckDBPyConnection: - """Create a DuckDB connection for embeddings query and retrieval. - - This method extends TIMDEXDatasetMetadata's pre-existing DuckDB connection - (via the attached TIMDEXDataset), creating views in the 'data' schema. - """ + def _setup_embeddings_views(self) -> None: + """Set up embeddings views in the 'data' schema.""" start_time = time.perf_counter() - conn = self.timdex_dataset.conn - try: - self._create_embeddings_view(conn) - self._create_current_embeddings_view(conn) - self._create_current_run_embeddings_view(conn) + self._create_embeddings_view(self.conn) + self._create_current_embeddings_view(self.conn) + self._create_current_run_embeddings_view(self.conn) except DuckDBIOException: - logger.warning("No embeddings found") + logger.debug("No embeddings parquet files found") except Exception as exception: # noqa: BLE001 - logger.warning(f"An error occurred while creating views: {exception}") + logger.warning(f"Error creating embeddings views: {exception}") logger.debug( - "DuckDB context created for TIMDEXEmbeddings, " + "Embeddings views setup for TIMDEXEmbeddings, " f"{round(time.perf_counter()-start_time,2)}s" ) - return conn def _create_embeddings_view(self, conn: DuckDBPyConnection) -> None: """Create a view that projects over embeddings parquet files.""" @@ -408,8 +386,8 @@ def _build_query( fetch results. Always joins to metadata.records to enable filtering by metadata columns (source, run_date, run_type, action, run_timestamp). 
""" - embeddings_table = self.get_sa_table(table) - metadata_table = self.timdex_dataset.metadata.get_sa_table("records") + embeddings_table = self.timdex_dataset.get_sa_table("data", table) + metadata_table = self.timdex_dataset.get_sa_table("metadata", "records") # select specific columns or default to all from embeddings + metadata if columns: diff --git a/timdex_dataset_api/metadata.py b/timdex_dataset_api/metadata.py index ca0317a..37a20d8 100644 --- a/timdex_dataset_api/metadata.py +++ b/timdex_dataset_api/metadata.py @@ -4,25 +4,22 @@ import shutil import tempfile import time -from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Literal, Unpack, cast -from urllib.parse import urlparse -import duckdb from duckdb import DuckDBPyConnection from duckdb_engine import Dialect as DuckDBDialect -from sqlalchemy import Table, func, literal, select, text, tuple_ +from sqlalchemy import func, literal, select, text, tuple_ from timdex_dataset_api.config import configure_logger from timdex_dataset_api.utils import ( + DuckDBConnectionFactory, S3Client, build_filter_expr_sa, - sa_reflect_duckdb_conn, ) if TYPE_CHECKING: - from timdex_dataset_api.dataset import DatasetFilters + from timdex_dataset_api.dataset import DatasetFilters, TIMDEXDataset logger = configure_logger(__name__) @@ -39,54 +36,35 @@ ] -@dataclass -class TIMDEXDatasetMetadataConfig: - """Configurations for metadata operations. - - - duckdb_connection_memory_limit: Memory limit for DuckDB connection - - duckdb_connection_threads: Thread limit for DuckDB connection - """ - - duckdb_connection_memory_limit: str = field( - default_factory=lambda: os.getenv("TDA_DUCKDB_MEMORY_LIMIT", "4GB") - ) - duckdb_connection_threads: int = field( - default_factory=lambda: int(os.getenv("TDA_DUCKDB_THREADS", "8")) - ) - - class TIMDEXDatasetMetadata: - def __init__( - self, - location: str, - *, - preload_current_records: bool = False, - ) -> None: + def __init__(self, timdex_dataset: "TIMDEXDataset") -> None: """Init TIMDEXDatasetMetadata. Args: - location: root location of TIMDEX dataset, e.g. 
's3://timdex/dataset' - preload_current_records: if True, create in-memory temp table for - current_records (faster for repeated queries); if False, create view only - (default, lower memory) + timdex_dataset: parent TIMDEXDataset instance """ - self.location = location - self.config = TIMDEXDatasetMetadataConfig() - self.preload_current_records = preload_current_records + self.timdex_dataset = timdex_dataset + self.conn = timdex_dataset.conn self.create_metadata_structure() - self.conn: DuckDBPyConnection = self.setup_duckdb_context() - self._sa_metadata = sa_reflect_duckdb_conn(self.conn, schema="metadata") + self._setup_metadata_schema() + + @property + def location(self) -> str: + return self.timdex_dataset.location @property def location_scheme(self) -> Literal["file", "s3"]: - scheme = urlparse(self.location).scheme - if scheme == "": - return "file" - if scheme == "s3": - return "s3" - raise ValueError(f"Location with scheme type '{scheme}' not supported.") + return self.timdex_dataset.location_scheme + + @property + def config(self) -> "TIMDEXDataset.config": # type: ignore[name-defined] + return self.timdex_dataset.config + + @property + def preload_current_records(self) -> bool: + return self.timdex_dataset.preload_current_records @property def metadata_root(self) -> str: @@ -138,7 +116,7 @@ def append_deltas_count(self) -> int: ] # type: ignore[index] def create_metadata_structure(self) -> None: - """Ensure metadata structure exists in TIDMEX dataset..""" + """Ensure metadata structure exists in TIMDEX dataset.""" if self.location_scheme == "file": Path(self.metadata_database_path).parent.mkdir( parents=True, @@ -149,91 +127,6 @@ def create_metadata_structure(self) -> None: exist_ok=True, ) - def configure_duckdb_connection(self, conn: DuckDBPyConnection) -> None: - """Configure a DuckDB connection/context. - - These configurations include things like memory settings, AWS authentication, etc. - """ - self._install_duckdb_extensions(conn) - self._configure_duckdb_s3_secret(conn) - self._configure_duckdb_memory_profile(conn) - - def _install_duckdb_extensions(self, conn: DuckDBPyConnection) -> None: - """Ensure DuckDB capable of installing extensions and install any required.""" - # ensure secrets and extensions paths are accessible - home_env = os.getenv("HOME") - use_fallback_home = not home_env or not Path(home_env).is_dir() - - if use_fallback_home: - duckdb_home = Path("/tmp/.duckdb") # noqa: S108 - secrets_dir = duckdb_home / "secrets" - extensions_dir = duckdb_home / "extensions" - - secrets_dir.mkdir(parents=True, exist_ok=True) - extensions_dir.mkdir(parents=True, exist_ok=True) - - conn.execute(f"set secret_directory='{secrets_dir.as_posix()}';") - conn.execute(f"set extension_directory='{extensions_dir.as_posix()}';") - - # install HTTPFS extension - conn.execute( - """ - install httpfs; - load httpfs; - """ - ) - - def _configure_duckdb_s3_secret( - self, - conn: DuckDBPyConnection, - scope: str | None = None, - ) -> None: - """Configure a secret in a DuckDB connection for S3 access. - - If a scope is provided, e.g. an S3 URI prefix like 's3://timdex', set a scope - parameter in the config. Else, leave it blank. 
- """ - # establish scope string - scope_str = f", scope '{scope}'" if scope else "" - - if os.getenv("MINIO_S3_ENDPOINT_URL"): - conn.execute( - f""" - create or replace secret minio_s3_secret ( - type s3, - endpoint '{urlparse(os.environ["MINIO_S3_ENDPOINT_URL"]).netloc}', - key_id '{os.environ["MINIO_USERNAME"]}', - secret '{os.environ["MINIO_PASSWORD"]}', - region 'us-east-1', - url_style 'path', - use_ssl false - {scope_str} - ); - """ - ) - - elif self.location_scheme == "s3": - conn.execute( - f""" - create or replace secret aws_s3_secret ( - type s3, - provider credential_chain, - refresh true - {scope_str} - ); - """ - ) - - def _configure_duckdb_memory_profile(self, conn: DuckDBPyConnection) -> None: - conn.execute( - f""" - set enable_external_file_cache = false; - set memory_limit = '{self.config.duckdb_connection_memory_limit}'; - set threads = {self.config.duckdb_connection_threads}; - set preserve_insertion_order=false; - """ - ) - def database_exists(self) -> bool: """Check if static metadata database file exists.""" if self.location_scheme == "s3": @@ -241,20 +134,6 @@ def database_exists(self) -> bool: return s3_client.object_exists(self.metadata_database_path) return os.path.exists(self.metadata_database_path) - def get_sa_table(self, table: str) -> Table: - """Get SQLAlchemy Table from reflected SQLAlchemy metadata.""" - schema_table = f"metadata.{table}" - if schema_table not in self._sa_metadata.tables: - raise ValueError( - f"Could not find table '{table}' in DuckDB schema 'metadata'." - ) - return self._sa_metadata.tables[schema_table] - - def refresh(self) -> None: - """Refresh DuckDB connection and reflected SQLAlchemy metadata on self.""" - self.conn = self.setup_duckdb_context() - self._sa_metadata = sa_reflect_duckdb_conn(self.conn, schema="metadata") - def rebuild_dataset_metadata(self) -> None: """Fully rebuild dataset metadata. @@ -274,9 +153,8 @@ def rebuild_dataset_metadata(self) -> None: with tempfile.TemporaryDirectory() as temp_dir: local_db_path = str(Path(temp_dir) / self.metadata_database_filename) - with duckdb.connect(local_db_path) as conn: - self.configure_duckdb_connection(conn) - + factory = DuckDBConnectionFactory(location_scheme=self.location_scheme) + with factory.create_connection(local_db_path) as conn: self._create_full_dataset_table(conn) # copy local database file to remote location @@ -289,8 +167,8 @@ def rebuild_dataset_metadata(self) -> None: else: shutil.copy(local_db_path, self.metadata_database_path) - # refresh DuckDB connection - self.conn = self.setup_duckdb_context() + # refresh dataset to pick up new metadata + self.timdex_dataset.refresh() def _create_full_dataset_table(self, conn: DuckDBPyConnection) -> None: """Create a table of metadata for all records in the ETL parquet dataset. @@ -319,7 +197,7 @@ def _create_full_dataset_table(self, conn: DuckDBPyConnection) -> None: conn.execute(query) # reset thread count - conn.execute(f"""SET threads = {self.config.duckdb_connection_threads};""") + conn.execute(f"""SET threads = {self.timdex_dataset.conn_factory.threads};""") row_count = conn.query("""select count(*) from records;""").fetchone()[0] # type: ignore[index] logger.info( @@ -327,45 +205,30 @@ def _create_full_dataset_table(self, conn: DuckDBPyConnection) -> None: f"elapsed: {time.perf_counter() - start_time}" ) - def setup_duckdb_context(self) -> DuckDBPyConnection: - """Create a DuckDB connection that provides full dataset metadata information. - - The following work is performed: - 1. 
Attach to static metadata database file. - 2. Create views that union static metadata with any append deltas. - 3. Create additional metadata views as needed. - - The resulting, in-memory DuckDB connection is used for all metadata queries. + def _setup_metadata_schema(self) -> None: + """Set up metadata schema views in the DuckDB connection. - If a static database file is not found, a configured DuckDB connection is still - returned. + Creates views for accessing static metadata DB and append deltas. + If static DB doesn't exist, logs warning but doesn't fail. """ start_time = time.perf_counter() - conn = duckdb.connect() - conn.execute("""SET enable_progress_bar = false;""") - self.configure_duckdb_connection(conn) - if not self.database_exists(): logger.warning( f"Static metadata database not found @ '{self.metadata_database_path}'. " - "Please recreate via TIMDEXDatasetMetadata.recreate_database_file()." + "Consider rebuild via TIMDEXDataset.metadata.rebuild_dataset_metadata()." ) - return conn - - # create metadata schema - conn.execute("create schema metadata;") + return - self._attach_database_file(conn) - self._create_append_deltas_view(conn) - self._create_records_union_view(conn) - self._create_current_records_view(conn) + self._attach_database_file(self.conn) + self._create_append_deltas_view(self.conn) + self._create_records_union_view(self.conn) + self._create_current_records_view(self.conn) logger.debug( - "DuckDB context created for TIMDEXDatasetMetadata, " + "Metadata schema setup for TIMDEXDatasetMetadata, " f"{round(time.perf_counter()-start_time,2)}s" ) - return conn def _attach_database_file(self, conn: DuckDBPyConnection) -> None: """Readonly attach to static metadata database. @@ -649,7 +512,7 @@ def build_keyset_paginated_metadata_query( **filters: Unpack["DatasetFilters"], ) -> str: """Build SQL query using SQLAlchemy against metadata schema tables and views.""" - sa_table = self.get_sa_table(table) + sa_table = self.timdex_dataset.get_sa_table("metadata", table) # create SQL statement object stmt = select( diff --git a/timdex_dataset_api/utils.py b/timdex_dataset_api/utils.py index 4a9ba08..6fea970 100644 --- a/timdex_dataset_api/utils.py +++ b/timdex_dataset_api/utils.py @@ -5,11 +5,13 @@ import pathlib import time from datetime import UTC, date, datetime -from typing import TYPE_CHECKING, Any +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal from urllib.parse import urlparse import boto3 -from duckdb import DuckDBPyConnection # type: ignore[import-untyped] +import duckdb +from duckdb import DuckDBPyConnection from duckdb_engine import ConnectionWrapper from sqlalchemy import ( MetaData, @@ -106,6 +108,109 @@ def _split_s3_uri(s3_uri: str) -> tuple[str, str]: return bucket, key +class DuckDBConnectionFactory: + """Factory for creating and configuring DuckDB connections. 
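+
+    Illustrative usage, a sketch only (the database file path is an
+    example):
+
+        factory = DuckDBConnectionFactory(location_scheme="s3")
+        conn = factory.create_connection()  # in-memory, fully configured
+        db_conn = factory.create_connection("/tmp/metadata.duckdb")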
+ + Args: + location_scheme: "file" or "s3", determines S3 credential setup + memory_limit: DuckDB memory limit (env: TDA_DUCKDB_MEMORY_LIMIT, default: "4GB") + threads: DuckDB thread limit (env: TDA_DUCKDB_THREADS, default: 8) + """ + + def __init__( + self, + location_scheme: Literal["file", "s3"] = "file", + memory_limit: str | None = None, + threads: int | None = None, + ): + self.location_scheme = location_scheme + self.memory_limit = memory_limit or os.getenv("TDA_DUCKDB_MEMORY_LIMIT", "4GB") + self.threads = threads or int(os.getenv("TDA_DUCKDB_THREADS", "8")) + + def create_connection(self, path: str = ":memory:") -> DuckDBPyConnection: + """Create a new configured DuckDB connection. + + Args: + path: Database file path or ":memory:" for in-memory database (default) + """ + start_time = time.perf_counter() + conn = duckdb.connect(path) + conn.execute("SET enable_progress_bar = false;") + self.configure_connection(conn) + logger.debug( + f"DuckDB connection created, {round(time.perf_counter()-start_time,2)}s" + ) + return conn + + def configure_connection(self, conn: DuckDBPyConnection) -> None: + """Configure an existing DuckDB connection.""" + self._install_extensions(conn) + self._configure_s3_secret(conn) + self._configure_memory_profile(conn) + + def _install_extensions(self, conn: DuckDBPyConnection) -> None: + """Ensure DuckDB capable of installing extensions and install any required.""" + home_env = os.getenv("HOME") + use_fallback_home = not home_env or not Path(home_env).is_dir() + + if use_fallback_home: + duckdb_home = Path("/tmp/.duckdb") # noqa: S108 + secrets_dir = duckdb_home / "secrets" + extensions_dir = duckdb_home / "extensions" + + secrets_dir.mkdir(parents=True, exist_ok=True) + extensions_dir.mkdir(parents=True, exist_ok=True) + + conn.execute(f"set secret_directory='{secrets_dir.as_posix()}';") + conn.execute(f"set extension_directory='{extensions_dir.as_posix()}';") + + conn.execute( + """ + install httpfs; + load httpfs; + """ + ) + + def _configure_s3_secret(self, conn: DuckDBPyConnection) -> None: + """Configure a secret in a DuckDB connection for S3 access.""" + if os.getenv("MINIO_S3_ENDPOINT_URL"): + conn.execute( + f""" + create or replace secret minio_s3_secret ( + type s3, + endpoint '{urlparse(os.environ["MINIO_S3_ENDPOINT_URL"]).netloc}', + key_id '{os.environ["MINIO_USERNAME"]}', + secret '{os.environ["MINIO_PASSWORD"]}', + region 'us-east-1', + url_style 'path', + use_ssl false + ); + """ + ) + + elif self.location_scheme == "s3": + conn.execute( + """ + create or replace secret aws_s3_secret ( + type s3, + provider credential_chain, + refresh true + ); + """ + ) + + def _configure_memory_profile(self, conn: DuckDBPyConnection) -> None: + """Configure DuckDB memory and thread settings.""" + conn.execute( + f""" + set enable_external_file_cache = false; + set memory_limit = '{self.memory_limit}'; + set threads = {self.threads}; + set preserve_insertion_order=false; + """ + ) + + def sa_reflect_duckdb_conn( conn: DuckDBPyConnection, schema: str | None = None ) -> MetaData: From efccf88467120c7dd07b6bb5b3436bc898a09e66 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Fri, 19 Dec 2025 09:46:48 -0500 Subject: [PATCH 3/3] Version bump to 3.9 --- timdex_dataset_api/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timdex_dataset_api/__init__.py b/timdex_dataset_api/__init__.py index e713149..fb35f27 100644 --- a/timdex_dataset_api/__init__.py +++ b/timdex_dataset_api/__init__.py @@ -5,7 +5,7 @@ from 
timdex_dataset_api.metadata import TIMDEXDatasetMetadata from timdex_dataset_api.record import DatasetRecord -__version__ = "3.8.0" +__version__ = "3.9.0" __all__ = [ "DatasetEmbedding",