Skip to content
This repository was archived by the owner on May 5, 2023. It is now read-only.

Commit 255d966

Browse files
committed
feat: add spider for crawling covidtracking data
1 parent 7a4d800 commit 255d966

File tree

4 files changed

+162
-1
lines changed

4 files changed

+162
-1
lines changed

django_covid19/spider/nCoV/items.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,8 @@ class CountryItem(DjangoItem):
2828

2929
class CityItem(DjangoItem):
    # Scrapy item backed by the City Django model; its fields are
    # derived from the model by DjangoItem.
    django_model = models.City
32+
33+
class StateItem(DjangoItem):
    # Scrapy item backed by the State Django model; yielded by the
    # covidtracking spider and persisted by CovidTrackingPipeline.
    django_model = models.State

django_covid19/spider/nCoV/pipelines.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,17 @@
1414

1515
from . import items
1616

17+
class CovidTrackingPipeline(object):
    """Persist StateItem objects produced by the covidtracking spider.

    A (state, countryShortCode) pair identifies a row, so items are
    upserted via update_or_create. Items of any other type pass through
    unchanged for later pipelines (e.g. NcovPipeline) to handle.
    """

    def process_item(self, item, spider):
        if isinstance(item, items.StateItem):
            # Natural key for the State row; every other item field goes
            # into defaults= so an existing row is refreshed in place.
            lookup = {
                'state': item['state'],
                'countryShortCode': item['countryShortCode'],
            }
            model = items.StateItem.django_model
            model.objects.update_or_create(defaults=item, **lookup)
        return item
1728

1829
class NcovPipeline(object):
1930

@@ -46,6 +57,8 @@ def process_item(self, item, spider):
4657
klass = item.__class__
4758
klass.django_model.objects.create(**item)
4859
return item
60+
else:
61+
return item
4962

5063
    def close_spider(self, spider):
        # Stash the spider's crawl bookkeeping in the Django cache
        # (presumably read elsewhere to report crawl status -- confirm).
        cache.set('crawled', spider.crawled)

django_covid19/spider/nCoV/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
6767
ITEM_PIPELINES = {
    # Lower value runs first: NcovPipeline (300) sees every item before
    # CovidTrackingPipeline (400).
    'nCoV.pipelines.NcovPipeline': 300,
    'nCoV.pipelines.CovidTrackingPipeline': 400
}
7071

7172
# Enable and configure the AutoThrottle extension (disabled by default)
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
# -*- coding: utf-8 -*-
2+
# @Author: zhanglei3
3+
# @Date: 2020-04-08 09:08:13
4+
# @Last Modified by: leafcoder
5+
# @Last Modified time: 2020-05-30 19:02:49
6+
7+
"""美国各州疫情数据源"""
8+
9+
import json
10+
import scrapy
11+
import logging
12+
from scrapy.selector import Selector
13+
14+
from .. import items
15+
16+
from django.core.cache import cache
17+
from django.utils.timezone import datetime, make_aware
18+
from django.utils.translation import ugettext_lazy as _
19+
20+
logger = logging.getLogger()
21+
22+
# For state i18n: US state/territory display names keyed by the
# whitespace-collapsed CamelCase name used by the data source, wrapped in
# ugettext_lazy so they are picked up for translation.
STATES = {
    "Alabama": _("Alabama"),
    "Alaska": _("Alaska"),
    "AmericanSamoa": _("AmericanSamoa"),
    "Arizona": _("Arizona"),
    "Arkansas": _("Arkansas"),
    "California": _("California"),
    "Colorado": _("Colorado"),
    "Connecticut": _("Connecticut"),
    "Delaware": _("Delaware"),
    "DistrictOfColumbia": _("DistrictOfColumbia"),
    "Florida": _("Florida"),
    "Georgia": _("Georgia"),
    "Guam": _("Guam"),
    "Hawaii": _("Hawaii"),
    "Idaho": _("Idaho"),
    "Illinois": _("Illinois"),
    "Indiana": _("Indiana"),
    "Iowa": _("Iowa"),
    "Kansas": _("Kansas"),
    "Kentucky": _("Kentucky"),
    "Louisiana": _("Louisiana"),
    "Maine": _("Maine"),
    "Maryland": _("Maryland"),
    "Massachusetts": _("Massachusetts"),
    "Michigan": _("Michigan"),
    "Minnesota": _("Minnesota"),
    "Mississippi": _("Mississippi"),
    "Missouri": _("Missouri"),
    "Montana": _("Montana"),
    "Nebraska": _("Nebraska"),
    "Nevada": _("Nevada"),
    "NewHampshire": _("NewHampshire"),
    "NewJersey": _("NewJersey"),
    "NewMexico": _("NewMexico"),
    "NewYork": _("NewYork"),
    "NorthCarolina": _("NorthCarolina"),
    "NorthDakota": _("NorthDakota"),
    "NorthernMarianaIslands": _("NorthernMarianaIslands"),
    "Ohio": _("Ohio"),
    "Oklahoma": _("Oklahoma"),
    "Oregon": _("Oregon"),
    "Pennsylvania": _("Pennsylvania"),
    "PuertoRico": _("PuertoRico"),
    "RhodeIsland": _("RhodeIsland"),
    "SouthCarolina": _("SouthCarolina"),
    "SouthDakota": _("SouthDakota"),
    "Tennessee": _("Tennessee"),
    "Texas": _("Texas"),
    "USVirginIslands": _("USVirginIslands"),
    "Utah": _("Utah"),
    "Vermont": _("Vermont"),
    "Virginia": _("Virginia"),
    "Washington": _("Washington"),
    "WestVirginia": _("WestVirginia"),
    "Wisconsin": _("Wisconsin"),
    "Wyoming": _("Wyoming")
}
81+
82+
class CovidTrackingSpider(scrapy.Spider):

    """Crawl per-US-state epidemic data.

    Data source: https://covidtracking.com/api

    Crawl chain:
      1. states/info.json       -> seed ``self.states`` with metadata
      2. states/current.json    -> merge current counts into each state
      3. states/<st>/daily.json -> attach daily history, emit StateItem
    """

    name = "covidtracking"
    allowed_domains = ["covidtracking.com"]
    country_short_code = 'USA'
    # Per-state accumulator: state code -> partially built item dict.
    states = {}

    def start_requests(self):
        # The whole crawl is seeded from the metadata endpoint; the other
        # endpoints are requested from its callback chain.
        # (Removed an unused hard-coded list of API URLs that was never
        # referenced.)
        yield scrapy.Request(
            'https://covidtracking.com/api/v1/states/info.json',
            self.parse_info)

    def parse_info(self, response):
        """Seed self.states from states/info.json, then fetch current data."""
        countryShortCode = self.country_short_code
        states = self.states
        result = json.loads(response.text)
        for item in result:
            state = item['state']
            # Collapse whitespace: "New York" -> "NewYork" (matches the
            # keys of the module-level STATES i18n table).
            stateName = ''.join(item['name'].split())
            states[state] = {
                'state': state,
                'countryShortCode': countryShortCode,
                'stateName': stateName
            }
        yield scrapy.Request(
            'https://covidtracking.com/api/v1/states/current.json',
            self.parse_states_current)

    def parse_states_current(self, response):
        """Merge current counts into each seeded state, then fetch history."""
        countryShortCode = self.country_short_code
        states = self.states
        result = json.loads(response.text)
        for item in result:
            state = item['state']
            state_item = states.get(state)
            if state_item is None:
                # current.json listed a state that info.json did not seed;
                # previously this raised KeyError and killed the crawl.
                self.logger.warning(
                    'State %s missing from states/info.json; skipping', state)
                continue
            state_item.update(item)
            # Dropped before persisting -- presumably not State model
            # fields; confirm against the model.
            state_item.pop('grade', None)
            state_item.pop('total', None)
            state_item['countryShortCode'] = countryShortCode
            yield scrapy.Request(
                'https://covidtracking.com/api/v1/states/%s/daily.json'
                % state,
                self.parse_state_daily,
                meta={
                    'state_item': state_item
                })

    def parse_state_daily(self, response):
        """Attach the state's daily series as JSON and emit the StateItem."""
        state_item = response.meta['state_item']
        # The API returns newest-first; reversed, presumably so dailyData
        # is stored in chronological order.
        state_item['dailyData'] = json.dumps(
            json.loads(response.text)[::-1])
        yield items.StateItem(**state_item)

0 commit comments

Comments
 (0)