3 files changed (+115, −0): three new public crawlers added under proxypool/crawlers/public.

New crawler: SeoFangFaCrawler (seo方法, https://proxy.seofangfa.com/)

import requests
from pyquery import PyQuery as pq

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

requests.packages.urllib3.disable_warnings()
BASE_URL = "https://proxy.seofangfa.com/"
MAX_PAGE = 1


class SeoFangFaCrawler(BaseCrawler):
    """
    seo方法 (seofangfa) crawler, https://proxy.seofangfa.com/
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse html to get proxies
        :return: generator of Proxy objects
        """
        doc = pq(html)
        # skip the header row, then read host and port cells of the proxy table
        trs = doc('.table tr:gt(0)').items()
        for tr in trs:
            host = tr.find('td:nth-child(1)').text()
            port = int(tr.find('td:nth-child(2)').text())
            yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = SeoFangFaCrawler()
    for proxy in crawler.crawl():
        print(proxy)
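All three new crawlers subclass BaseCrawler and only implement parse() (XiaoShuCrawler below also builds self.urls in its constructor). For readers of this diff, the sketch below shows the contract these files rely on: a urls list, a fetch() helper, and a crawl() driver. It is not part of the change set; the method bodies are assumptions inferred from how the crawlers use them, not the repository's actual implementation.

# Minimal sketch of the BaseCrawler contract assumed by these crawlers (illustrative only).
import requests


class BaseCrawler:
    urls = []  # subclasses set this as a class attribute or build it in __init__

    def fetch(self, url, **kwargs):
        # return the page HTML, or None on failure; timeout/verify values are assumptions
        try:
            response = requests.get(url, timeout=10, verify=False, **kwargs)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            return None

    def crawl(self):
        # fetch every configured URL and yield the Proxy objects produced by parse()
        for url in self.urls:
            html = self.fetch(url)
            if not html:
                continue
            yield from self.parse(html)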
New crawler: XiaoShuCrawler (小舒代理, http://www.xsdaili.cn/)

import re
from pyquery import PyQuery as pq

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

BASE_URL = "http://www.xsdaili.cn/"
PAGE_BASE_URL = "http://www.xsdaili.cn/dayProxy/ip/{page}.html"
MAX_PAGE = 50


class XiaoShuCrawler(BaseCrawler):
    """
    小舒代理 (Xiaoshu proxy) crawler, http://www.xsdaili.cn/
    """

    def __init__(self):
        # take the article id of the most recent daily post from the index page,
        # then build the URLs of the latest MAX_PAGE posts
        html = self.fetch(url=BASE_URL)
        doc = pq(html)
        titles = doc(".title:eq(0) a").items()

        latest_page = 0
        for t in titles:
            res = re.search(r"/(\d+)\.html", t.attr("href"))
            latest_page = int(res.group(1)) if res else 0
        if latest_page:
            self.urls = [PAGE_BASE_URL.format(page=page) for page in range(latest_page - MAX_PAGE, latest_page)]
        else:
            self.urls = []

    def parse(self, html):
        """
        parse html to get proxies
        :return: generator of Proxy objects
        """
        doc = pq(html)
        contents = doc('.cont').text()
        contents = contents.split("\n")
        for content in contents:
            # drop everything from '@' onward, keeping only the host:port part
            c = content[:content.find("@")]
            host, port = c.split(":")
            yield Proxy(host=host, port=int(port))


if __name__ == '__main__':
    crawler = XiaoShuCrawler()
    for proxy in crawler.crawl():
        print(proxy)
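XiaoShuCrawler.parse() assumes every line in the .cont block contains an '@' and a well-formed host:port prefix; an empty line or a line without ':' would raise a ValueError on the host, port unpacking. A possible hardening, not part of this diff, is sketched below (hypothetical variant, same selector and Proxy schema):

    # hypothetical, more defensive variant of XiaoShuCrawler.parse (not in the diff)
    def parse(self, html):
        doc = pq(html)
        for content in doc('.cont').text().split("\n"):
            # keep only the host:port part; tolerate lines without '@'
            c = content[:content.find("@")] if "@" in content else content
            host, sep, port = c.partition(":")
            if not sep or not port.strip().isdigit():
                continue  # skip blank or malformed lines instead of raising
            yield Proxy(host=host.strip(), port=int(port))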
New crawler: YqIeCrawler (http://ip.yqie.com/ipproxy.htm)

from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

BASE_URL = "http://ip.yqie.com/ipproxy.htm"
MAX_PAGE = 1


class YqIeCrawler(BaseCrawler):
    """
    ip yqie crawler, http://ip.yqie.com/ipproxy.htm
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse html to get proxies
        :return: generator of Proxy objects
        """
        doc = pq(html)
        # skip the header row of the #GridViewOrder table, then read host and port cells
        trs = doc('#GridViewOrder tr:gt(0)').items()
        for tr in trs:
            host = tr.find('td:nth-child(1)').text()
            port = int(tr.find('td:nth-child(2)').text())
            yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = YqIeCrawler()
    for proxy in crawler.crawl():
        print(proxy)
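Each file ends with a __main__ block, so every crawler can be run standalone to print the proxies it scrapes. An additional manual check, not part of this diff, is to route a request through one of the scraped proxies; the snippet below is an illustrative sketch that assumes the module path proxypool.crawlers.public.seofangfa and that the Proxy schema exposes host and port attributes.

# illustrative standalone check (not part of the change set)
import requests

from proxypool.crawlers.public.seofangfa import SeoFangFaCrawler  # module path assumed

crawler = SeoFangFaCrawler()
proxy = next(crawler.crawl(), None)  # first proxy scraped, if any
if proxy:
    proxies = {"http": f"http://{proxy.host}:{proxy.port}"}
    try:
        resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
        print(resp.status_code, resp.text)
    except requests.RequestException as exc:
        print("proxy check failed:", exc)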