From 4571bbaca19e5f355db50c74df575d4f7a1b5fc0 Mon Sep 17 00:00:00 2001 From: Linsey Passarella Date: Mon, 15 Oct 2018 16:23:22 +0000 Subject: [PATCH 1/2] MiniProj2_part1 --- lpassare.ipynb | 193 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 lpassare.ipynb diff --git a/lpassare.ipynb b/lpassare.ipynb new file mode 100644 index 0000000..d3a3149 --- /dev/null +++ b/lpassare.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import sys\n", + "import re\n", + "import pymongo\n", + "import json\n", + "import time\n", + "import datetime\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "dbname = \"fdac18mp2\" #please use this database\n", + "collname = \"glprj_jdunca51\" #please modify so you store data in your collection\n", + "my_char = 'f'\n", + "\n", + "# beginning page index\n", + "begin = \"1\"\n", + "client = pymongo.MongoClient()\n", + "\n", + "db = client[dbname]\n", + "coll = db[collname]\n", + "\n", + "\n", + "gitlab_url = \"https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page=\" + begin + \\\n", + " \"&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false\"\n", + "\n", + "gleft = 20\n", + "\n", + "source_url = \"https://sourceforge.net/directory/?q=\" + my_char + \"&sort=name&page=\"\n", + "rest_url = \"https://sourceforge.net/rest/p/\"\n", + "\n", + "header = {'per_page': 99}\n", + "\n", + "# check remaining query chances for rate-limit restriction\n", + "def wait(left):\n", + " global header\n", + " while (left < 20):\n", + " l = requests.get('https://gitlab.com/api/v4/projects', headers=header)\n", + " if (l.ok):\n", + " left = int(l.headers.get('RateLimit-Remaining'))\n", + " time .sleep(60)\n", + " return left\n", + "\n", + "def project_exists(url):\n", + " r = requests.get(url)\n", + " if r.status_code == 200:\n", + " return True\n", + " return False\n", + "\n", + "def get_source(url, coll, rest):\n", + " page = 1\n", + " project_count = 0\n", + " while True:\n", + " resp = requests.get(url + str(page))\n", + " text = resp.text\n", + " soup = BeautifulSoup(text, 'html.parser')\n", + " if re.search('No results found.', soup.get_text()):\n", + " return\n", + "\n", + " for link in soup.find_all(class_=\"project-icon\", href=True):\n", + " name = re.findall('/projects/([A-Za-z0-9\\-]*)', link.get('href'))\n", + " name = name[0] if name else None\n", + " if name is not None and name.lower().startswith(my_char):\n", + " resp = requests.get(rest + name)\n", + " if resp.status_code == 200:\n", + " info = json.loads(resp.text)\n", + " info['forge'] = 'sourceforge'\n", + " coll.insert_one(info)\n", + " project_count += 1\n", + " if project_count >= 50:\n", + " return\n", + " page += 1\n", + " return\n", + "\n", + "# send queries and extract urls \n", + "def get_gitlab(url, coll):\n", + "\n", + " global gleft\n", + " global header\n", + " global bginnum\n", + " gleft = wait(gleft)\n", + " values = []\n", + " size = 0\n", + " project_count = 0\n", + "\n", + " try:\n", + " r = requests .get(url, headers=header)\n", + " time .sleep(0.5)\n", + " # got blocked\n", + " if r.status_code == 403:\n", + " return \"got blocked\", str(bginnum)\n", + " if (r.ok):\n", + "\n", + " gleft = int(r.headers.get('RateLimit-Remaining'))\n", + " lll = r.headers.get('Link')\n", + " t = r.text\n", + " array = json.loads(t)\n", + " \n", + " for el in array:\n", + " if el['name'].lower().startswith(my_char):\n", + " if project_exists(el['http_url_to_repo']):\n", + " project_count += 1\n", + " el['forge'] = 'gitlab'\n", + " coll.insert_one(el)\n", + " if project_count >= 50:\n", + " return\n", + " \n", + " #next page\n", + " while ('; rel=\"next\"' in lll):\n", + " gleft = int(r.headers.get('RateLimit-Remaining'))\n", + " gleft = wait(gleft)\n", + " # extract next page url\n", + " ll = lll.replace(';', ',').split(',')\n", + " url = ll[ll.index(' rel=\"next\"') -\n", + " 1].replace('<', '').replace('>', '').lstrip()\n", + " \n", + " try:\n", + " r = requests .get(url, headers=header)\n", + " if r.status_code == 403:\n", + " return \"got blocked\", str(bginnum)\n", + " if (r.ok):\n", + " lll = r.headers.get('Link')\n", + " t = r.text\n", + " array1 = json.loads(t)\n", + " for el in array1:\n", + " if el['name'].lower().startswith(my_char):\n", + " if project_exists(el['http_url_to_repo']):\n", + " project_count += 1\n", + " el['forge'] = 'gitlab'\n", + " coll.insert_one(el)\n", + " if project_count >= 50:\n", + " return\n", + " else:\n", + " sys.stderr.write(\"url can not found:\\n\" + url + '\\n')\n", + " return \n", + " except requests.exceptions.ConnectionError:\n", + " sys.stderr.write('could not get ' + url + '\\n')\n", + " \n", + " else:\n", + " sys.stderr.write(\"url can not found:\\n\" + url + '\\n')\n", + " return\n", + "\n", + " except requests.exceptions.ConnectionError:\n", + " sys.stderr.write('could not get ' + url + '\\n')\n", + " except Exception as e:\n", + " sys.stderr.write(url + ';' + str(e) + '\\n')\n", + " \n", + "#start retrieving \n", + "get_gitlab(gitlab_url,coll)\n", + "get_source(source_url, coll, rest_url)\n", + "#print collected data\n", + "for doc in coll.find({}):\n", + " print(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a2e9ae7c0258d8093f70a562bb54d073dc33ddb4 Mon Sep 17 00:00:00 2001 From: Linsey Passarella Date: Mon, 15 Oct 2018 18:35:32 +0000 Subject: [PATCH 2/2] Updated wrong Notebook --- lpassare.ipynb | 85 +++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 64 deletions(-) diff --git a/lpassare.ipynb b/lpassare.ipynb index d3a3149..756de57 100644 --- a/lpassare.ipynb +++ b/lpassare.ipynb @@ -15,27 +15,22 @@ "import time\n", "import datetime\n", "import requests\n", - "from bs4 import BeautifulSoup\n", "\n", "dbname = \"fdac18mp2\" #please use this database\n", - "collname = \"glprj_jdunca51\" #please modify so you store data in your collection\n", - "my_char = 'f'\n", - "\n", + "collname = \"glprj_lpassare\" #please modify so you store data in your collection\n", "# beginning page index\n", - "begin = \"1\"\n", + "begin = \"0\"\n", "client = pymongo.MongoClient()\n", "\n", "db = client[dbname]\n", "coll = db[collname]\n", + "letter=\"b\"\n", "\n", - "\n", - "gitlab_url = \"https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page=\" + begin + \\\n", + "beginurl = \"https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page=\" + begin + \\\n", " \"&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false\"\n", "\n", - "gleft = 20\n", "\n", - "source_url = \"https://sourceforge.net/directory/?q=\" + my_char + \"&sort=name&page=\"\n", - "rest_url = \"https://sourceforge.net/rest/p/\"\n", + "gleft = 0\n", "\n", "header = {'per_page': 99}\n", "\n", @@ -49,39 +44,8 @@ " time .sleep(60)\n", " return left\n", "\n", - "def project_exists(url):\n", - " r = requests.get(url)\n", - " if r.status_code == 200:\n", - " return True\n", - " return False\n", - "\n", - "def get_source(url, coll, rest):\n", - " page = 1\n", - " project_count = 0\n", - " while True:\n", - " resp = requests.get(url + str(page))\n", - " text = resp.text\n", - " soup = BeautifulSoup(text, 'html.parser')\n", - " if re.search('No results found.', soup.get_text()):\n", - " return\n", - "\n", - " for link in soup.find_all(class_=\"project-icon\", href=True):\n", - " name = re.findall('/projects/([A-Za-z0-9\\-]*)', link.get('href'))\n", - " name = name[0] if name else None\n", - " if name is not None and name.lower().startswith(my_char):\n", - " resp = requests.get(rest + name)\n", - " if resp.status_code == 200:\n", - " info = json.loads(resp.text)\n", - " info['forge'] = 'sourceforge'\n", - " coll.insert_one(info)\n", - " project_count += 1\n", - " if project_count >= 50:\n", - " return\n", - " page += 1\n", - " return\n", - "\n", "# send queries and extract urls \n", - "def get_gitlab(url, coll):\n", + "def get(url, coll):\n", "\n", " global gleft\n", " global header\n", @@ -89,7 +53,6 @@ " gleft = wait(gleft)\n", " values = []\n", " size = 0\n", - " project_count = 0\n", "\n", " try:\n", " r = requests .get(url, headers=header)\n", @@ -105,14 +68,14 @@ " array = json.loads(t)\n", " \n", " for el in array:\n", - " if el['name'].lower().startswith(my_char):\n", - " if project_exists(el['http_url_to_repo']):\n", - " project_count += 1\n", - " el['forge'] = 'gitlab'\n", - " coll.insert_one(el)\n", - " if project_count >= 50:\n", - " return\n", + " if el['name'].lower().startswith(letter):\n", + " el['site'] = \"git\"\n", + " count += 1\n", + " coll.insert_one(el)\n", + " if count > 49:\n", + " return\n", " \n", + " \n", " #next page\n", " while ('; rel=\"next\"' in lll):\n", " gleft = int(r.headers.get('RateLimit-Remaining'))\n", @@ -131,19 +94,13 @@ " t = r.text\n", " array1 = json.loads(t)\n", " for el in array1:\n", - " if el['name'].lower().startswith(my_char):\n", - " if project_exists(el['http_url_to_repo']):\n", - " project_count += 1\n", - " el['forge'] = 'gitlab'\n", - " coll.insert_one(el)\n", - " if project_count >= 50:\n", - " return\n", + " coll.insert(el)\n", " else:\n", " sys.stderr.write(\"url can not found:\\n\" + url + '\\n')\n", " return \n", " except requests.exceptions.ConnectionError:\n", " sys.stderr.write('could not get ' + url + '\\n')\n", - " \n", + "\n", " else:\n", " sys.stderr.write(\"url can not found:\\n\" + url + '\\n')\n", " return\n", @@ -154,11 +111,7 @@ " sys.stderr.write(url + ';' + str(e) + '\\n')\n", " \n", "#start retrieving \n", - "get_gitlab(gitlab_url,coll)\n", - "get_source(source_url, coll, rest_url)\n", - "#print collected data\n", - "for doc in coll.find({}):\n", - " print(doc)" + "get(beginurl,coll)" ] }, { @@ -166,7 +119,11 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import gitlab \n", + "gl = gitlab.Gitlab('http://10.0.0.1')\n", + "gl.search('projects','b',per_page=50)" + ] } ], "metadata": {