3 files changed (+115, −0): three new public crawlers added under proxypool/crawlers/public.

New crawler: SeoFangFaCrawler (seo方法, https://proxy.seofangfa.com/)

import requests
from pyquery import PyQuery as pq

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

requests.packages.urllib3.disable_warnings()
BASE_URL = "https://proxy.seofangfa.com/"
MAX_PAGE = 1


class SeoFangFaCrawler(BaseCrawler):
    """
    seo方法 (seofangfa) crawler, https://proxy.seofangfa.com/
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse html to get proxies
        :return: generator of Proxy objects
        """
        doc = pq(html)
        # skip the header row, then read host and port cells of the proxy table
        trs = doc('.table tr:gt(0)').items()
        for tr in trs:
            host = tr.find('td:nth-child(1)').text()
            port = int(tr.find('td:nth-child(2)').text())
            yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = SeoFangFaCrawler()
    for proxy in crawler.crawl():
        print(proxy)
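All three new crawlers subclass BaseCrawler and only implement parse() (XiaoShuCrawler below also builds self.urls in its constructor). For readers of this diff, the sketch below shows the contract these files rely on: a urls list, a fetch() helper, and a crawl() driver. It is not part of the change set; the method bodies are assumptions inferred from how the crawlers use them, not the repository's actual implementation.

# Minimal sketch of the BaseCrawler contract assumed by these crawlers (illustrative only).
import requests


class BaseCrawler:
    urls = []  # subclasses set this as a class attribute or build it in __init__

    def fetch(self, url, **kwargs):
        # return the page HTML, or None on failure; timeout/verify values are assumptions
        try:
            response = requests.get(url, timeout=10, verify=False, **kwargs)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            return None

    def crawl(self):
        # fetch every configured URL and yield the Proxy objects produced by parse()
        for url in self.urls:
            html = self.fetch(url)
            if not html:
                continue
            yield from self.parse(html)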
New crawler: XiaoShuCrawler (小舒代理, http://www.xsdaili.cn/)

import re
from pyquery import PyQuery as pq

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

BASE_URL = "http://www.xsdaili.cn/"
PAGE_BASE_URL = "http://www.xsdaili.cn/dayProxy/ip/{page}.html"
MAX_PAGE = 50


class XiaoShuCrawler(BaseCrawler):
    """
    小舒代理 (Xiaoshu proxy) crawler, http://www.xsdaili.cn/
    """

    def __init__(self):
        # take the article id of the most recent daily post from the index page,
        # then build the URLs of the latest MAX_PAGE posts
        html = self.fetch(url=BASE_URL)
        doc = pq(html)
        titles = doc(".title:eq(0) a").items()

        latest_page = 0
        for t in titles:
            res = re.search(r"/(\d+)\.html", t.attr("href"))
            latest_page = int(res.group(1)) if res else 0
        if latest_page:
            self.urls = [PAGE_BASE_URL.format(page=page) for page in range(latest_page - MAX_PAGE, latest_page)]
        else:
            self.urls = []

    def parse(self, html):
        """
        parse html to get proxies
        :return: generator of Proxy objects
        """
        doc = pq(html)
        contents = doc('.cont').text()
        contents = contents.split("\n")
        for content in contents:
            # drop everything from '@' onward, keeping only the host:port part
            c = content[:content.find("@")]
            host, port = c.split(":")
            yield Proxy(host=host, port=int(port))


if __name__ == '__main__':
    crawler = XiaoShuCrawler()
    for proxy in crawler.crawl():
        print(proxy)
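XiaoShuCrawler.parse() assumes every line in the .cont block contains an '@' and a well-formed host:port prefix; an empty line or a line without ':' would raise a ValueError on the host, port unpacking. A possible hardening, not part of this diff, is sketched below (hypothetical variant, same selector and Proxy schema):

    # hypothetical, more defensive variant of XiaoShuCrawler.parse (not in the diff)
    def parse(self, html):
        doc = pq(html)
        for content in doc('.cont').text().split("\n"):
            # keep only the host:port part; tolerate lines without '@'
            c = content[:content.find("@")] if "@" in content else content
            host, sep, port = c.partition(":")
            if not sep or not port.strip().isdigit():
                continue  # skip blank or malformed lines instead of raising
            yield Proxy(host=host.strip(), port=int(port))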
New crawler: YqIeCrawler (http://ip.yqie.com/ipproxy.htm)

from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

BASE_URL = "http://ip.yqie.com/ipproxy.htm"
MAX_PAGE = 1


class YqIeCrawler(BaseCrawler):
    """
    ip yqie crawler, http://ip.yqie.com/ipproxy.htm
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse html to get proxies
        :return: generator of Proxy objects
        """
        doc = pq(html)
        # skip the header row of the #GridViewOrder table, then read host and port cells
        trs = doc('#GridViewOrder tr:gt(0)').items()
        for tr in trs:
            host = tr.find('td:nth-child(1)').text()
            port = int(tr.find('td:nth-child(2)').text())
            yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = YqIeCrawler()
    for proxy in crawler.crawl():
        print(proxy)
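Each file ends with a __main__ block, so every crawler can be run standalone to print the proxies it scrapes. An additional manual check, not part of this diff, is to route a request through one of the scraped proxies; the snippet below is an illustrative sketch that assumes the module path proxypool.crawlers.public.seofangfa and that the Proxy schema exposes host and port attributes.

# illustrative standalone check (not part of the change set)
import requests

from proxypool.crawlers.public.seofangfa import SeoFangFaCrawler  # module path assumed

crawler = SeoFangFaCrawler()
proxy = next(crawler.crawl(), None)  # first proxy scraped, if any
if proxy:
    proxies = {"http": f"http://{proxy.host}:{proxy.port}"}
    try:
        resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
        print(resp.status_code, resp.text)
    except requests.RequestException as exc:
        print("proxy check failed:", exc)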