From 23d01bde944867cf1017ff0a084ae98a70e580d2 Mon Sep 17 00:00:00 2001 From: Sachin Panayil Date: Mon, 30 Jun 2025 15:09:39 -0400 Subject: [PATCH 1/2] fixed data parsing Signed-off-by: Sachin Panayil --- config/updateCodeGov.js | 10 +- config/updateIssuePool.js | 24 +++-- issue-pool.json | 221 +++++++++++++++++++++++++++++++++++++- 3 files changed, 234 insertions(+), 21 deletions(-) diff --git a/config/updateCodeGov.js b/config/updateCodeGov.js index ff713714..492a88fc 100644 --- a/config/updateCodeGov.js +++ b/config/updateCodeGov.js @@ -3,7 +3,6 @@ const path = require('path') // these will always stay constant const CONFIG = { - testAgencyDirectory: path.resolve(__dirname, "../test-agency-indexes"), agencyDirectory: path.resolve(__dirname, "../agency-indexes"), outputFile: path.resolve(__dirname, "../codegov.json"), regex: /^(.*?)-.*\.json$/ @@ -14,17 +13,16 @@ i focused on the fact that the agencyDirectory will always contain jsons with th but that prove to be a disadvantage down the road */ // updates the codegov.json file with new data found from ./agency-indexes -async function updateCodeGov(isTesting = false) { +async function updateCodeGov() { try { const updatedJSON = {} - directoryPath = isTesting === true ? CONFIG.testAgencyDirectory : CONFIG.agencyDirectory // read all files in the directory - const filenames = await fs.readdir(directoryPath) + const filenames = await fs.readdir(CONFIG.agencyDirectory) // we know that the directory will only contain json files so dont need to check for non jsons for (const file of filenames) { - const filePath = path.join(directoryPath, file) + const filePath = path.join(CONFIG.agencyDirectory, file) try { const content = await fs.readFile(filePath, "utf-8") @@ -51,4 +49,4 @@ async function updateCodeGov(isTesting = false) { } } -updateCodeGov(isTesting = true) \ No newline at end of file +updateCodeGov() \ No newline at end of file diff --git a/config/updateIssuePool.js b/config/updateIssuePool.js index 2dd3fbac..d6378bb8 100644 --- a/config/updateIssuePool.js +++ b/config/updateIssuePool.js @@ -23,7 +23,7 @@ async function getRepoInfo() { // dont know how i feel about this double loop se let repoInfo = [] try { - const content = await fs.readFile(CONFIG.repoFilePath, "utf-8") // filter by tier 3 maturity to get the projects that truly want outside help + const content = await fs.readFile(CONFIG.repoFilePath, "utf-8") const jsonData = JSON.parse(content) for (const agencyKey in jsonData) { @@ -34,12 +34,17 @@ async function getRepoInfo() { // dont know how i feel about this double loop se if (organization.repositoryURL) { const match = organization.repositoryURL.match(CONFIG.regex) - const [url, owner, repo] = match - repoInfo.push({ - ownerName: owner, - repoName: repo - }) + if (match) { + const [url, owner, repo] = match + + repoInfo.push({ + ownerName: owner, + repoName: repo + }) + } else { + console.warn(`No match found for URL: ${organization.repositoryURL}`) + } } } } @@ -143,14 +148,14 @@ async function updateIssuePool() { const issues = await issuesResponse.json() // endpoint always returns both issues and pull requests so we ignore the PRs - for (const issue of issues) { + for (const [index, issue] of issues.entries()) { if (issue.pull_request) { continue } - const transformedIssue = transformIssue(issue, repo, repoLanguage) issuePool[transformedIssue.id] = transformedIssue // is having the ID is the best key name? 
+ console.log(`✅ Processed ${index}/${issues.length}: ${repo.ownerName}/${issue.repoName}`) } if (issues.length < 100) { @@ -159,9 +164,6 @@ async function updateIssuePool() { page++ } - - console.log(`✅ Processed ${i + 1}/${repoInfo.length}: ${repo.ownerName}/${repo.repoName}`) - } catch (error) { console.error(`❌ Error processing ${repo.ownerName}/${repo.repoName}:`, error) continue diff --git a/issue-pool.json b/issue-pool.json index 47b917f4..94fc1709 100644 --- a/issue-pool.json +++ b/issue-pool.json @@ -14,8 +14,8 @@ "status_is_locked": false, "time_created_date": "2025-02-06T18:28:23Z", "time_last_updated": "2025-06-25T20:07:44Z", - "time_days_old": 140, - "time_last_activity_days_ago": 1, + "time_days_old": 144, + "time_last_activity_days_ago": 5, "people_author": "natalialuzuriaga", "people_assignee": null, "people_author_type": "User", @@ -51,8 +51,8 @@ "status_is_locked": false, "time_created_date": "2025-02-25T14:57:54Z", "time_last_updated": "2025-06-25T20:02:52Z", - "time_days_old": 121, - "time_last_activity_days_ago": 1, + "time_days_old": 126, + "time_last_activity_days_ago": 5, "people_author": "decause-gov", "people_assignee": null, "people_author_type": "User", @@ -74,5 +74,218 @@ "flags_is_bug": false, "flags_is_feature": true, "flags_is_stale": false + }, + "3008914624": { + "id": "3008914624", + "number": 47, + "url": "https://github.com/DSACMS/codejson-generator/issues/47", + "content_title": "Form: Updating styling", + "content_description": "## Issue Report\n\nForm: Updating styling\n\n### Expected behavior\n\n- We would like form to match the styling of the SHARE IT Landing Page for consistency in branding: https://dsacms.github.io/share-it-act-lp/\n\n### Actual behavior\n\n- Currently uses custom CSS and bootstrap on the frontend.\n", + "repo_name": "codejson-generator", + "repo_url": "https://github.com/DSACMS/codejson-generator", + "repo_language": "JavaScript", + "repo_owner": "DSACMS", + "status_is_open": true, + "status_has_assignee": false, + "status_is_locked": false, + "time_created_date": "2025-04-21T17:29:10Z", + "time_last_updated": "2025-06-25T20:09:16Z", + "time_days_old": 71, + "time_last_activity_days_ago": 5, + "people_author": "natalialuzuriaga", + "people_assignee": null, + "people_author_type": "User", + "labels_list": [ + "enhancement", + "code-gov" + ], + "labels_count": 2, + "labels_has_priority": false, + "labels_has_difficulty": false, + "engagement_comment_count": 0, + "engagement_reaction_count": 0, + "engagement_score": 0, + "flags_is_beginner_friendly": false, + "flags_needs_help": false, + "flags_is_bug": false, + "flags_is_feature": true, + "flags_is_stale": false + }, + "3018169397": { + "id": "3018169397", + "number": 49, + "url": "https://github.com/DSACMS/codejson-generator/issues/49", + "content_title": "VCS agnostic auto generation", + "content_description": "Currently the auto generation only works with the use of GitHub. We should be agnostic to all version control systems. 
#48 has more discussion on this in the comments", + "repo_name": "codejson-generator", + "repo_url": "https://github.com/DSACMS/codejson-generator", + "repo_language": "JavaScript", + "repo_owner": "DSACMS", + "status_is_open": true, + "status_has_assignee": false, + "status_is_locked": false, + "time_created_date": "2025-04-24T18:48:05Z", + "time_last_updated": "2025-06-25T20:09:17Z", + "time_days_old": 67, + "time_last_activity_days_ago": 5, + "people_author": "sachin-panayil", + "people_assignee": null, + "people_author_type": "User", + "labels_list": [ + "code-gov" + ], + "labels_count": 1, + "labels_has_priority": false, + "labels_has_difficulty": false, + "engagement_comment_count": 0, + "engagement_reaction_count": 0, + "engagement_score": 0, + "flags_is_beginner_friendly": false, + "flags_needs_help": false, + "flags_is_bug": false, + "flags_is_feature": false, + "flags_is_stale": false + }, + "3018171660": { + "id": "3018171660", + "number": 50, + "url": "https://github.com/DSACMS/codejson-generator/issues/50", + "content_title": "Take into consideration multiple LICENSE URLs", + "content_description": "IDEA: Have a 'checker' that detects whether all URLs return a 200 when the generate button is clicked.\n\n#48 for more reference", + "repo_name": "codejson-generator", + "repo_url": "https://github.com/DSACMS/codejson-generator", + "repo_language": "JavaScript", + "repo_owner": "DSACMS", + "status_is_open": true, + "status_has_assignee": false, + "status_is_locked": false, + "time_created_date": "2025-04-24T18:48:57Z", + "time_last_updated": "2025-06-25T20:09:17Z", + "time_days_old": 67, + "time_last_activity_days_ago": 5, + "people_author": "sachin-panayil", + "people_assignee": null, + "people_author_type": "User", + "labels_list": [ + "code-gov" + ], + "labels_count": 1, + "labels_has_priority": false, + "labels_has_difficulty": false, + "engagement_comment_count": 0, + "engagement_reaction_count": 0, + "engagement_score": 0, + "flags_is_beginner_friendly": false, + "flags_needs_help": false, + "flags_is_bug": false, + "flags_is_feature": false, + "flags_is_stale": false + }, + "3018174794": { + "id": "3018174794", + "number": 51, + "url": "https://github.com/DSACMS/codejson-generator/issues/51", + "content_title": "Support multiple different schemas depending on user", + "content_description": "Have multiple different schemas for the form and use a drop down to select them to support different agencies\n\n#48 for more reference", + "repo_name": "codejson-generator", + "repo_url": "https://github.com/DSACMS/codejson-generator", + "repo_language": "JavaScript", + "repo_owner": "DSACMS", + "status_is_open": true, + "status_has_assignee": false, + "status_is_locked": false, + "time_created_date": "2025-04-24T18:50:22Z", + "time_last_updated": "2025-06-25T20:09:18Z", + "time_days_old": 67, + "time_last_activity_days_ago": 5, + "people_author": "sachin-panayil", + "people_assignee": null, + "people_author_type": "User", + "labels_list": [ + "code-gov" + ], + "labels_count": 1, + "labels_has_priority": false, + "labels_has_difficulty": false, + "engagement_comment_count": 0, + "engagement_reaction_count": 0, + "engagement_score": 0, + "flags_is_beginner_friendly": false, + "flags_needs_help": false, + "flags_is_bug": false, + "flags_is_feature": false, + "flags_is_stale": false + }, + "3087309433": { + "id": "3087309433", + "number": 31, + "url": "https://github.com/DSACMS/automated-codejson-generator/issues/31", + "content_title": "Shallow merge logic allows for improper merge", + 
"content_description": "Since the logic currently runs a shallow merge, we sometimes lose the manual inputs of nested objects and arrays since they are not included in the merge properly. We need to create a way to merge each field properly regardless of the user scenario ", + "repo_name": "automated-codejson-generator", + "repo_url": "https://github.com/DSACMS/automated-codejson-generator", + "repo_language": "TypeScript", + "repo_owner": "DSACMS", + "status_is_open": true, + "status_has_assignee": false, + "status_is_locked": false, + "time_created_date": "2025-05-23T19:15:55Z", + "time_last_updated": "2025-06-25T20:02:03Z", + "time_days_old": 38, + "time_last_activity_days_ago": 5, + "people_author": "sachin-panayil", + "people_assignee": null, + "people_author_type": "User", + "labels_list": [ + "bug", + "code-gov" + ], + "labels_count": 2, + "labels_has_priority": false, + "labels_has_difficulty": false, + "engagement_comment_count": 0, + "engagement_reaction_count": 0, + "engagement_score": 0, + "flags_is_beginner_friendly": false, + "flags_needs_help": false, + "flags_is_bug": true, + "flags_is_feature": false, + "flags_is_stale": false + }, + "3104214028": { + "id": "3104214028", + "number": 33, + "url": "https://github.com/DSACMS/automated-codejson-generator/issues/33", + "content_title": "Outputs don't seem to be there", + "content_description": "Hi! First of all, thank you so much for building this! \n\nI'm on MDCT and it works awesome as intended. I tried to automate further so that it would automate the merging of the PR this project produces. There are some issues related to that that I've hit but in the process of bumbling around I noticed that this file:\nhttps://github.com/DSACMS/automated-codejson-generator/blob/main/action.yml\nmade me think that I could expect outputs:\n```yml\noutputs:\n updated:\n description: \"Boolean indicating whether code.json was updated\"\n pr_url:\n description: \"URL of the created pull request if changes were made\"\n```\nto be present such that in my own update-code-json.yml file I could do something like this:\n```yml\nname: Update Code.json\non:\n workflow_dispatch:\n schedule:\n - cron: \"0 0 1 * *\" # monthly on day 1\n\npermissions:\n contents: write\n pull-requests: write\n\njobs:\n update-code-json:\n runs-on: ubuntu-latest\n steps:\n\n ...\n\n - name: Update code.json\n id: update\n uses: DSACMS/automated-codejson-generator@main\n with:\n GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n BRANCH: \"main\"\n\n - name: Print PR url\n run: |\n echo \"${{steps.update.outputs.pr_url}}\"\n```\nIt's not stopping me anymore because I got something like this thrown together:\n```yml\n - name: Print PR url\n run: |\n PR_URL=$(gh pr list --state open --json url,headRefName \\\n --jq '.[] | select(.headRefName | startswith(\"code-json-\")) | .url' \\\n | head -n 1)\n echo \"Found PR: $PR_URL\"\n```\nFeel free to remove or close this issue if it isn't helpful and thanks again for building this, it's been super helpful!", + "repo_name": "automated-codejson-generator", + "repo_url": "https://github.com/DSACMS/automated-codejson-generator", + "repo_language": "TypeScript", + "repo_owner": "DSACMS", + "status_is_open": true, + "status_has_assignee": false, + "status_is_locked": false, + "time_created_date": "2025-05-30T20:29:51Z", + "time_last_updated": "2025-06-25T20:02:14Z", + "time_days_old": 31, + "time_last_activity_days_ago": 5, + "people_author": "peoplespete", + "people_assignee": null, + "people_author_type": "User", + "labels_list": [ + "good 
first issue", + "code-gov" + ], + "labels_count": 2, + "labels_has_priority": false, + "labels_has_difficulty": false, + "engagement_comment_count": 2, + "engagement_reaction_count": 1, + "engagement_score": 3, + "flags_is_beginner_friendly": true, + "flags_needs_help": false, + "flags_is_bug": false, + "flags_is_feature": false, + "flags_is_stale": false } } \ No newline at end of file From 76543315c82aa5106fae83ddbaecc963c9d9560c Mon Sep 17 00:00:00 2001 From: Sachin Panayil Date: Mon, 30 Jun 2025 16:49:29 -0400 Subject: [PATCH 2/2] added basic rate limiting and paralle processing Signed-off-by: Sachin Panayil --- config/updateIssuePool.js | 133 +++++++++++++++++++++++++------------- issue-pool.json | 28 ++++---- 2 files changed, 102 insertions(+), 59 deletions(-) diff --git a/config/updateIssuePool.js b/config/updateIssuePool.js index d6378bb8..c157a63e 100644 --- a/config/updateIssuePool.js +++ b/config/updateIssuePool.js @@ -6,7 +6,10 @@ const CONFIG = { issueFilePath: path.resolve(__dirname, "../issue-pool.json"), regex: /https?:\/\/github\.com\/([^\/]+)\/([^\/]+)/, githubToken: process.env.GITHUB_TOKEN, - requiredLabel: 'code-gov' + requiredLabel: 'code-gov', + concurrentRepos: 6, // processing 6 repos at once but need to find the sweetspot because at this rate, it takes 18 minutes for the entire script to run through codegov.json. the "bathtub curve" is what we have here and what we need to experiment with and solve 👀 + rateLimitRemaining: 5000, + rateLimitReset: Date.now } // #region - Helper Functions @@ -19,6 +22,24 @@ const getHeaders = () => { return HEADERS } +async function fetchWithRateLimit(url, options = {}) { + if (CONFIG.rateLimitRemaining <= 10 && Date.now() < CONFIG.rateLimitReset) { + const waitTime = CONFIG.rateLimitReset - Date.now() + 1000 // add 1 second buffer + console.log(`Rate limit low (${CONFIG.rateLimitRemaining} remaining). Waiting ${Math.round(waitTime/1000)}s...`) + await new Promise(resolve => setTimeout(resolve, waitTime)) + } + + const response = await fetch(url, options) + + const remainingHeader = response.headers.get('X-RateLimit-Remaining') + const resetHeader = response.headers.get('X-RateLimit-Reset') + + if (remainingHeader) CONFIG.rateLimitRemaining = parseInt(remainingHeader) + if (resetHeader) CONFIG.rateLimitReset = parseInt(resetHeader) * 1000 + + return response +} + async function getRepoInfo() { // dont know how i feel about this double loop setup... let repoInfo = [] @@ -112,61 +133,83 @@ function transformIssue(issue, repo, repoLanguage) { } } -// #region - Main Function -async function updateIssuePool() { - const issuePool = {} - const repoInfo = await getRepoInfo() - const headers = getHeaders() - - for (let i = 0; i < repoInfo.length; i++) { // switch to a forOf loop here? 
- const repo = repoInfo[i] +async function processSingleRepository(repo, headers) { + const repoIssues = {} + + try { + const repoUrl = `https://api.github.com/repos/${repo.ownerName}/${repo.repoName}` + const repoResponse = await fetchWithRateLimit(repoUrl, { headers }) - try { - const repoUrl = `https://api.github.com/repos/${repo.ownerName}/${repo.repoName}` - const repoResponse = await fetch(repoUrl, { headers }) + if (!repoResponse.ok) { + console.error(`Failed to fetch repo info for ${repo.ownerName}/${repo.repoName}: ${repoResponse.status}`) + return repoIssues + } - if (!repoResponse.ok) { - console.error(`Failed to fetch repo info for ${repo.ownerName}/${repo.repoName}: ${repoResponse.status}`) - continue - } + const repoData = await repoResponse.json() + const repoLanguage = repoData.language || "" - const repoData = await repoResponse.json() - const repoLanguage = repoData.language || "" + let page = 1 + let hasMore = true - let page = 1 - let hasMore = true + while (hasMore) { + const issuesUrl = `https://api.github.com/repos/${repo.ownerName}/${repo.repoName}/issues?page=${page}&per_page=100&state=open&labels=${CONFIG.requiredLabel}` + const issuesResponse = await fetchWithRateLimit(issuesUrl, { headers }) - while (hasMore) { - const issuesUrl = `https://api.github.com/repos/${repo.ownerName}/${repo.repoName}/issues?page=${page}&per_page=100&state=open&labels=${CONFIG.requiredLabel}` - const issuesResponse = await fetch(issuesUrl, { headers }) + if (!issuesResponse.ok) { + console.error(`Failed to fetch issues for ${repo.ownerName}/${repo.repoName}: ${issuesResponse.status}`) + break + } - if (!issuesResponse.ok) { - console.error(`Failed to fetch issues for ${repo.ownerName}/${repo.repoName}: ${issuesResponse.status}`) - break + const issues = await issuesResponse.json() + + // endpoint always returns both issues and pull requests so we ignore the PRs + for (const [index, issue] of issues.entries()) { + if (issue.pull_request) { + continue } - - const issues = await issuesResponse.json() - // endpoint always returns both issues and pull requests so we ignore the PRs - for (const [index, issue] of issues.entries()) { - if (issue.pull_request) { - continue - } - - const transformedIssue = transformIssue(issue, repo, repoLanguage) - issuePool[transformedIssue.id] = transformedIssue // is having the ID is the best key name? - console.log(`✅ Processed ${index}/${issues.length}: ${repo.ownerName}/${issue.repoName}`) - } + const transformedIssue = transformIssue(issue, repo, repoLanguage) + repoIssues[transformedIssue.id] = transformedIssue // is having the ID is the best key name? 
+ console.log(`✅ Processed ${index + 1}/${issues.length}: ${repo.ownerName}/${repo.repoName}`) + } - if (issues.length < 100) { - hasMore = false - } + if (issues.length < 100) { + hasMore = false + } + + page++ + } + } catch (error) { + console.error(`❌ Error processing ${repo.ownerName}/${repo.repoName}:`, error) + } - page++ + return repoIssues +} + +// #region - Main Function +async function updateIssuePool() { + const issuePool = {} + const repoInfo = await getRepoInfo() + const headers = getHeaders() + + // process repositories in chunks of 3 for parallel processing + for (let i = 0; i < repoInfo.length; i += CONFIG.concurrentRepos) { + const chunk = repoInfo.slice(i, i + CONFIG.concurrentRepos) + console.log(`Processing chunk ${Math.floor(i/CONFIG.concurrentRepos) + 1}/${Math.ceil(repoInfo.length/CONFIG.concurrentRepos)} (${chunk.length} repos)`) + + const chunkPromises = chunk.map(repo => processSingleRepository(repo, headers)) + const chunkResults = await Promise.allSettled(chunkPromises) + + chunkResults.forEach((result, index) => { + if (result.status === 'fulfilled') { + Object.assign(issuePool, result.value) + } else { + console.error(`Failed ${chunk[index].ownerName}/${chunk[index].repoName}:`, result.reason) } - } catch (error) { - console.error(`❌ Error processing ${repo.ownerName}/${repo.repoName}:`, error) - continue + }) + + if (i + CONFIG.concurrentRepos < repoInfo.length) { + await new Promise(resolve => setTimeout(resolve, 1000)) } } diff --git a/issue-pool.json b/issue-pool.json index 94fc1709..f2e3650a 100644 --- a/issue-pool.json +++ b/issue-pool.json @@ -14,8 +14,8 @@ "status_is_locked": false, "time_created_date": "2025-02-06T18:28:23Z", "time_last_updated": "2025-06-25T20:07:44Z", - "time_days_old": 144, - "time_last_activity_days_ago": 5, + "time_days_old": 145, + "time_last_activity_days_ago": 6, "people_author": "natalialuzuriaga", "people_assignee": null, "people_author_type": "User", @@ -52,7 +52,7 @@ "time_created_date": "2025-02-25T14:57:54Z", "time_last_updated": "2025-06-25T20:02:52Z", "time_days_old": 126, - "time_last_activity_days_ago": 5, + "time_last_activity_days_ago": 6, "people_author": "decause-gov", "people_assignee": null, "people_author_type": "User", @@ -91,7 +91,7 @@ "time_created_date": "2025-04-21T17:29:10Z", "time_last_updated": "2025-06-25T20:09:16Z", "time_days_old": 71, - "time_last_activity_days_ago": 5, + "time_last_activity_days_ago": 6, "people_author": "natalialuzuriaga", "people_assignee": null, "people_author_type": "User", @@ -126,8 +126,8 @@ "status_is_locked": false, "time_created_date": "2025-04-24T18:48:05Z", "time_last_updated": "2025-06-25T20:09:17Z", - "time_days_old": 67, - "time_last_activity_days_ago": 5, + "time_days_old": 68, + "time_last_activity_days_ago": 6, "people_author": "sachin-panayil", "people_assignee": null, "people_author_type": "User", @@ -161,8 +161,8 @@ "status_is_locked": false, "time_created_date": "2025-04-24T18:48:57Z", "time_last_updated": "2025-06-25T20:09:17Z", - "time_days_old": 67, - "time_last_activity_days_ago": 5, + "time_days_old": 68, + "time_last_activity_days_ago": 6, "people_author": "sachin-panayil", "people_assignee": null, "people_author_type": "User", @@ -196,8 +196,8 @@ "status_is_locked": false, "time_created_date": "2025-04-24T18:50:22Z", "time_last_updated": "2025-06-25T20:09:18Z", - "time_days_old": 67, - "time_last_activity_days_ago": 5, + "time_days_old": 68, + "time_last_activity_days_ago": 6, "people_author": "sachin-panayil", "people_assignee": null, 
"people_author_type": "User", @@ -231,8 +231,8 @@ "status_is_locked": false, "time_created_date": "2025-05-23T19:15:55Z", "time_last_updated": "2025-06-25T20:02:03Z", - "time_days_old": 38, - "time_last_activity_days_ago": 5, + "time_days_old": 39, + "time_last_activity_days_ago": 6, "people_author": "sachin-panayil", "people_assignee": null, "people_author_type": "User", @@ -267,8 +267,8 @@ "status_is_locked": false, "time_created_date": "2025-05-30T20:29:51Z", "time_last_updated": "2025-06-25T20:02:14Z", - "time_days_old": 31, - "time_last_activity_days_ago": 5, + "time_days_old": 32, + "time_last_activity_days_ago": 6, "people_author": "peoplespete", "people_assignee": null, "people_author_type": "User",