scrapy genspider stats stats.gov.cn
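The genspider command must be run inside an existing Scrapy project; the code below assumes the project is named myspider (that is the package the spider and settings import from). If the project does not exist yet, a typical setup before the command above would be:

scrapy startproject myspider
cd myspider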
settings.py
ROBOTSTXT_OBEY = False  # do not let robots.txt block the crawl
ITEM_PIPELINES = {
    'myspider.pipelines.StatsPipeline': 300,  # 300 is the pipeline priority; lower values run first
}
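Optionally, the crawl can be throttled so the statistics site is not hit too hard. These two standard Scrapy settings are a suggested addition, not part of the original configuration, and the values are only a guess:

# Optional: slow the crawl down a little to stay polite to stats.gov.cn
DOWNLOAD_DELAY = 0.5
CONCURRENT_REQUESTS_PER_DOMAIN = 4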
spiders/stats.py
import scrapy

from myspider.items import StatsItem


class StatsSpider(scrapy.Spider):
    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    # entry page: the 2020 edition of the statistical division codes (province list)
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html']
    def parse(self, response):
        # debug: print the page title to confirm the page was fetched and decoded correctly
        print(response.xpath("/html/head/title/text()").extract_first())
        provinces = response.xpath("//tr[@class='provincetr']")
        # print("provinces:", len(provinces))
        for province in provinces:
            province_name = province.xpath('td/a/text()').extract_first()
            city_href = province.xpath('td/a/@href').extract_first()
            city_url = response.urljoin(city_href)
            yield scrapy.Request(url=city_url, callback=self.parse_city,
                                 meta={'province_name': province_name})
            break  # crawl only the first province while testing; remove for a full crawl
    def parse_city(self, response):
        citys = response.xpath("//tr[@class='citytr']")
        for city in citys:
            # each citytr row carries two linked cells: the city code and the city name
            city_ = city.xpath('td/a/text()').extract()
            ad_href = city.xpath('td/a/@href').extract_first()
            if city_:
                city_code = city_[0]
                city_name = city_[1]
                ad_url = response.urljoin(ad_href)
                # print(city_code, city_name, ad_url)
                yield scrapy.Request(url=ad_url, callback=self.parse_ad,
                                     meta={'city_code': city_code, 'city_name': city_name,
                                           'province_name': response.meta['province_name']})
            break  # crawl only the first city while testing; remove for a full crawl
    def parse_ad(self, response):
        ads = response.xpath("//tr[@class='countytr']")
        for ad in ads:
            ad_ = ad.xpath('td/a/text()').extract()
            town_href = ad.xpath('td/a/@href').extract_first()
            # rows without a link (county-level entries with no lower level) give an empty list and are skipped
            if ad_:
                ad_code = ad_[0]
                ad_name = ad_[1]
                town_url = response.urljoin(town_href)
                # print(response.meta['province_name'], response.meta['city_name'], response.meta['city_code'],
                #       ad_name, ad_code, town_url)
                yield scrapy.Request(url=town_url, callback=self.parse_town,
                                     meta={'province_name': response.meta['province_name'],
                                           'city_code': response.meta['city_code'],
                                           'city_name': response.meta['city_name'],
                                           'ad_code': ad_code,
                                           'ad_name': ad_name})
    def parse_town(self, response):
        towns = response.xpath("//tr[@class='towntr']")
        for town in towns:
            town_ = town.xpath('td/a/text()').extract()
            community_href = town.xpath('td/a/@href').extract_first()
            if town_:
                town_code = town_[0]
                town_name = town_[1]
                community_url = response.urljoin(community_href)
                # print(response.meta['province_name'], response.meta['city_name'], response.meta['city_code'],
                #       response.meta['ad_name'], response.meta['ad_code'],
                #       town_code, town_name, community_url)
                yield scrapy.Request(url=community_url, callback=self.parse_community,
                                     meta={'province_name': response.meta['province_name'],
                                           'city_code': response.meta['city_code'],
                                           'city_name': response.meta['city_name'],
                                           'ad_code': response.meta['ad_code'],
                                           'ad_name': response.meta['ad_name'],
                                           'town_code': town_code,
                                           'town_name': town_name})
    def parse_community(self, response):
        communitys = response.xpath("//tr[@class='villagetr']")
        for community in communitys:
            # villagetr rows are plain cells with no links: code, classification code, name
            community_ = community.xpath('td/text()').extract()
            if community_:
                community_code = community_[0]
                community_kind = community_[1]
                community_name = community_[2]
                print(response.meta['province_name'], response.meta['city_code'], response.meta['city_name'],
                      response.meta['ad_code'], response.meta['ad_name'],
                      response.meta['town_code'], response.meta['town_name'],
                      community_code, community_kind, community_name)
                statsItem = StatsItem()
                statsItem['prov_name'] = response.meta['province_name']
                statsItem['city_code'] = response.meta['city_code']
                statsItem['city_name'] = response.meta['city_name']
                statsItem['ad_code'] = response.meta['ad_code']
                statsItem['ad_name'] = response.meta['ad_name']
                statsItem['town_code'] = response.meta['town_code']
                statsItem['town_name'] = response.meta['town_name']
                statsItem['community_code'] = community_code
                statsItem['community_kind'] = community_kind
                statsItem['community_name'] = community_name
                yield statsItem
pipelines.py
import json


class MyspiderPipeline:
    def process_item(self, item, spider):
        return item


class StatsPipeline:
    def __init__(self):
        # write one JSON object per line; utf-8 so the Chinese names survive intact
        self.file = open("a.json", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        print('StatsPipeline:', item)
        self.file.write(json.dumps(dict(item), ensure_ascii=False))
        self.file.write("\n")
        return item  # return the item so any later pipelines still receive it

    def close_spider(self, spider):
        # Scrapy calls this when the spider finishes; more reliable than __del__
        self.file.close()
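As an aside, the same one-object-per-line output can also be produced without a custom pipeline by using Scrapy's built-in feed exports. A minimal sketch for settings.py, assuming a Scrapy version recent enough to support the FEEDS setting:

# Alternative to StatsPipeline: let Scrapy's feed export write the items itself.
FEEDS = {
    'a.json': {
        'format': 'jsonlines',   # one JSON object per line
        'encoding': 'utf8',      # keep Chinese characters readable
    },
}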
items.py
import scrapy


class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class StatsItem(scrapy.Item):
    # one record per village/community, carrying the codes and names of all parent levels
    prov_name = scrapy.Field()
    city_code = scrapy.Field()
    city_name = scrapy.Field()
    ad_code = scrapy.Field()
    ad_name = scrapy.Field()
    town_code = scrapy.Field()
    town_name = scrapy.Field()
    community_code = scrapy.Field()
    community_kind = scrapy.Field()
    community_name = scrapy.Field()
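With the settings, spider, pipeline, and items in place, the crawl is started from the project directory; StatsPipeline then writes one JSON object per line into a.json in the directory the command is run from:

scrapy crawl stats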