Skip to content
This repository was archived by the owner on May 5, 2023. It is now read-only.

Commit 81995ad

Browse files

Committed: Merge branch 'develop'
2 parents (be16a2a + 4baf286), commit 81995ad

File tree

3 files changed

+30
-16
lines changed

3 files changed

+30
-16
lines changed

covid19/settings.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,5 +156,6 @@
156156
# Setting of Crontab
157157
CRONJOBS = (
158158
# 每分钟抓取一次
159-
('*/1 * * * *', 'ncovapi.cron.crawl_dxy', [], {}, '>> %s/crontab.log' % BASE_DIR),
159+
('*/1 * * * *', 'ncovapi.cron.crawl_dxy', [], {}, '>> %s/var/logs/crontab.log' % BASE_DIR),
160160
)
161+

spider/nCoV/pipelines.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,5 @@ def process_item(self, item, spider):
4242
return item
4343

4444
def close_spider(self, spider):
45-
cache.set('crawled', 1)
45+
if spider.crawler is not None:
46+
cache.set('crawled', 1)

spider/nCoV/spiders/dxy.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@
88

99
import json
1010
import scrapy
11+
import logging
1112
from scrapy.selector import Selector
1213
from .. import items
1314

1415
from django.utils.timezone import datetime, make_aware
1516

17+
logger = logging.getLogger()
18+
1619
class DXYSpider(scrapy.Spider):
1720

1821
name = "dxy"
@@ -25,6 +28,27 @@ def parse(self, response):
2528
sel = Selector(response)
2629
scripts = sel.xpath('//script')
2730

31+
# 判断是否需要保存抓取的数据
32+
statistics = self.get_dict(scripts, '#getStatisticsService')
33+
createTime = make_aware(
34+
datetime.fromtimestamp(statistics['createTime'] / 1000.0))
35+
modifyTime = make_aware(
36+
datetime.fromtimestamp(statistics['modifyTime'] / 1000.0))
37+
prev_crawler = items.CrawlerItem.django_model.objects.all().order_by('-id')[1]
38+
if prev_crawler.modifyTime == modifyTime:
39+
logger.info('Data does not change.')
40+
self.crawler.delete()
41+
self.crawler = None
42+
return
43+
self.crawler.createTime = createTime
44+
self.crawler.modifyTime = modifyTime
45+
self.crawler.save()
46+
47+
# 统计信息
48+
statistics = self.parse_statistics(statistics)
49+
for item in statistics:
50+
yield item
51+
2852
# 国内数据
2953
provinces = self.get_list(scripts, '#getAreaStat')
3054
for province in provinces:
@@ -48,11 +72,6 @@ def parse(self, response):
4872
country.pop('provinceShortName')
4973
yield items.CountryItem(**country)
5074

51-
# 统计信息
52-
statistics = self.get_statistics(scripts, '#getStatisticsService')
53-
for item in statistics:
54-
yield item
55-
5675
# 时间线事件,id=“getTimelineService2” 为英文内容
5776
timelines = self.get_list(scripts, '#getTimelineService1')
5877
for item in timelines:
@@ -106,8 +125,7 @@ def parse(self, response):
106125
rumor[key] = item.get(key)
107126
yield items.RumorItem(**rumor)
108127

109-
def get_statistics(self, scripts, data_id):
110-
data = self.get_dict(scripts, data_id)
128+
def parse_statistics(self, data):
111129
statistics = data['globalStatistics']
112130
item = {}
113131
for key in (
@@ -156,16 +174,10 @@ def get_statistics(self, scripts, data_id):
156174
}
157175
yield items.NoticeItem(**item)
158176

159-
self.crawler.createTime = make_aware(
160-
datetime.fromtimestamp(data['createTime'] / 1000.0))
161-
self.crawler.modifyTime = make_aware(
162-
datetime.fromtimestamp(data['modifyTime'] / 1000.0))
163-
self.crawler.save()
164-
165177
def get_list(self, scripts, data_id):
166178
ret = scripts.css(data_id).re(r'(\[.+\])')
167179
return json.loads(ret[0])
168180

169181
def get_dict(self, scripts, data_id):
170182
ret = scripts.css(data_id).re(r'\=\s*(\{.+\})\}catch\(e\)\{\}')
171-
return json.loads(ret[0])
183+
return json.loads(ret[0])

0 commit comments

Comments (0)