 
 import json
 import scrapy
+import logging
 from scrapy.selector import Selector
 from .. import items
 
 from django.utils.timezone import datetime, make_aware
 
+logger = logging.getLogger()
+
 class DXYSpider(scrapy.Spider):
 
     name = "dxy"
@@ -25,6 +28,27 @@ def parse(self, response):
         sel = Selector(response)
         scripts = sel.xpath('//script')
 
+        # Decide whether the crawled data needs to be saved
+        statistics = self.get_dict(scripts, '#getStatisticsService')
+        createTime = make_aware(
+            datetime.fromtimestamp(statistics['createTime'] / 1000.0))
+        modifyTime = make_aware(
+            datetime.fromtimestamp(statistics['modifyTime'] / 1000.0))
+        prev_crawler = items.CrawlerItem.django_model.objects.all().order_by('-id')[1]
+        if prev_crawler.modifyTime == modifyTime:
+            logger.info('Data has not changed.')
+            self.crawler.delete()
+            self.crawler = None
+            return
+        self.crawler.createTime = createTime
+        self.crawler.modifyTime = modifyTime
+        self.crawler.save()
+
+        # Statistics
+        statistics = self.parse_statistics(statistics)
+        for item in statistics:
+            yield item
+
         # Domestic data
         provinces = self.get_list(scripts, '#getAreaStat')
         for province in provinces:
@@ -48,11 +72,6 @@ def parse(self, response):
         country.pop('provinceShortName')
         yield items.CountryItem(**country)
 
-        # Statistics
-        statistics = self.get_statistics(scripts, '#getStatisticsService')
-        for item in statistics:
-            yield item
-
         # Timeline events; id="getTimelineService2" is the English version
         timelines = self.get_list(scripts, '#getTimelineService1')
         for item in timelines:
@@ -106,8 +125,7 @@ def parse(self, response):
                 rumor[key] = item.get(key)
             yield items.RumorItem(**rumor)
 
-    def get_statistics(self, scripts, data_id):
-        data = self.get_dict(scripts, data_id)
+    def parse_statistics(self, data):
         statistics = data['globalStatistics']
         item = {}
         for key in (
@@ -156,16 +174,10 @@ def get_statistics(self, scripts, data_id):
             }
             yield items.NoticeItem(**item)
 
-        self.crawler.createTime = make_aware(
-            datetime.fromtimestamp(data['createTime'] / 1000.0))
-        self.crawler.modifyTime = make_aware(
-            datetime.fromtimestamp(data['modifyTime'] / 1000.0))
-        self.crawler.save()
-
     def get_list(self, scripts, data_id):
         ret = scripts.css(data_id).re(r'(\[.+\])')
         return json.loads(ret[0])
 
     def get_dict(self, scripts, data_id):
         ret = scripts.css(data_id).re(r'\=\s*(\{.+\})\}catch\(e\)\{\}')
-        return json.loads(ret[0])
+        return json.loads(ret[0])
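For reference, a minimal standalone sketch of the extraction that get_dict() performs, assuming the DXY page embeds each dataset as try{window.getStatisticsService = {...}}catch(e){} inside a <script id="getStatisticsService"> tag; the sample markup and timestamp values below are illustrative, not taken from the live site.

import json
import re

# Illustrative markup mimicking how the page is assumed to embed its data.
sample_script = (
    '<script id="getStatisticsService">'
    'try{window.getStatisticsService = {"createTime":1580097300000,'
    '"modifyTime":1580173329000,"globalStatistics":{}}}catch(e){}'
    '</script>'
)

# Same pattern the spider uses: capture the JSON object between "=" and "}catch(e){}".
match = re.search(r'\=\s*(\{.+\})\}catch\(e\)\{\}', sample_script)
data = json.loads(match.group(1))
print(data['modifyTime'])  # 1580173329000 (epoch milliseconds)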
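A similarly minimal sketch of the skip check added to parse(), using the standard library in place of Django's make_aware; the timestamps are placeholders. The page reports createTime/modifyTime as epoch milliseconds, hence the division by 1000.0, and the crawl record is discarded when modifyTime equals the value stored by the previous run.

from datetime import datetime, timezone

def to_aware(ms):
    # Epoch milliseconds -> timezone-aware datetime (UTC here; the spider uses make_aware)
    return datetime.fromtimestamp(ms / 1000.0, tz=timezone.utc)

previous_modify_time = to_aware(1580173329000)  # stored by the previous crawl (placeholder)
current_modify_time = to_aware(1580173329000)   # parsed from #getStatisticsService (placeholder)

if previous_modify_time == current_modify_time:
    print('Data has not changed; drop this crawl record.')
else:
    print('New data; save the crawl record and yield items.')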