Python web-scraping notes
Recently I got handed a task: my team lead asked me to put together two files of national administrative-division data - Excel or SQL, either is fine - one for China and one for South Africa. That threw me a bit. China is manageable: just crawl the administrative-division code pages on the National Bureau of Statistics website (the latest release at the time). But where do you get South Africa's from?
Long story short, I dug through a lot of sources and they were either incomplete or out of date. Their data had to be collected by someone too, so why shouldn't I crawl it myself? Said and done. Let's start with China's administrative-division data.
First, the National Bureau of Statistics website.
The address we want to crawl is this one:
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/
Let's first figure out how the site is laid out.
From the home page's source we can see the following:
The whole page layout is driven by table tags. That means we have to be careful when extracting data with BeautifulSoup: the rows carrying the province and city names are not the only table on the page - there are several tables - so we cannot simply grab "the table" and be done. Instead we select the data rows by their class names.
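To make that concrete, here is a quick check (a sketch of my own, not part of the final script; it only assumes requests and beautifulsoup4 are installed) comparing the number of tables on the index page with the number of province links exposed by the .provincetr rows:

import requests
from bs4 import BeautifulSoup

index_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/'
resp = requests.get(index_url, headers={'User-Agent': 'Mozilla/5.0'})
resp.encoding = 'gbk'  # the site is served as GBK
soup = BeautifulSoup(resp.text, 'html.parser')
print('tables on the page:', len(soup.select('table')))         # several layout tables
print('province links:', len(soup.select('.provincetr a')))     # just the links we want

With that confirmed, the province-level fetch in the script looks like this: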
print('Fetching province list ...')
province_url = url + index_href
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
request = requests.get(province_url, headers=headers)
request.encoding = 'gbk'
province_html_text = str(request.text)
soup = BeautifulSoup(province_html_text, "html.parser")
province_tr_list = soup.select('.provincetr a')
Next, let's look at the ordinary data pages (that is, the city-, county- and town-level listing pages):
The reason for treating these three pages together is that, on inspection, they are structured identically; the only difference is the class attribute of the data rows (tr) in the HTML table, which is citytr, countytr and towntr respectively. Everything else matches, so a single generic method could scrape all three levels - see the sketch right after this paragraph.
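For illustration, such a generic fetcher might look like this (a sketch of the idea only; the full script further down keeps separate get_city/get_area functions, and get_rows is a name I made up):

import requests
from bs4 import BeautifulSoup

def get_rows(base_url, page_href, tr_class):
    """Fetch one listing page and return its data rows, selected by row class."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(base_url + page_href, headers=headers)
    resp.encoding = 'gbk'
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup.select('.' + tr_class)  # tr_class is 'citytr', 'countytr' or 'towntr'

# e.g. city_rows = get_rows(url, province_href, 'citytr')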
Iterating over the city-level information:
city_url = url + province_href
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
request = requests.get(city_url, headers=headers)
request.encoding = 'gbk'
city_html_text = str(request.text)
soup = BeautifulSoup(city_html_text, "html.parser")
city_tr_list = soup.select('.citytr')
# Iterate over the city rows
Then, for each city row, write it out and drill down to the district level:
for city_tr in city_tr_list:
    if city_tr:
        file = open('china_data/area.sql', 'a+', encoding='utf-8')
        city_a_info = city_tr.select('a')
        city_href = city_a_info[0].attrs['href']
        city_code = city_a_info[0].text[:6]
        city_name = city_a_info[1].text
        city_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(city_code) + '", "' + str(city_name) + '", "' + str(province_code) + '", "' + str(level) + '");\n'
        file.write(city_info)
        file.close()
        print('Wrote city:', city_info)
        # Districts
        get_area(city_href, city_code)
The data is written out as SQL statements, so all that is left is importing the file into a database. The three INSERT statements are built like this:
province_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(province_code) + '", "' + str(province_name) + '", "' + str(parent_code) + '", "' + str(level) + '");\n'
city_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(city_code) + '", "' + str(city_name) + '", "' + str(province_code) + '", "' + str(level) + '");\n'
area_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(area_code) + '", "' + str(area_name) + '", "' + str(city_code) + '", "' + str(level) + '");\n'
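These statements assume an area table with columns code, name, parent_code and level; the script quotes values with double quotes, which SQLite and default MySQL both accept as string literals. As a quick sanity check, the generated file can be loaded into a throwaway SQLite database (a sketch of my own; the column types are an assumption, the original post never shows the DDL):

import sqlite3

conn = sqlite3.connect('area.db')
conn.execute('CREATE TABLE IF NOT EXISTS area ('
             'code TEXT, name TEXT, parent_code TEXT, level TEXT)')
with open('china_data/area.sql', encoding='utf-8') as f:
    conn.executescript(f.read())  # the file is plain INSERT statements, one per line
conn.commit()
print(conn.execute('SELECT COUNT(*) FROM area').fetchone())
conn.close()

For MySQL the same file can be fed in directly, for example with mysql -u root -p your_db < china_data/area.sql.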
Finally, the complete script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Crawl the latest division-code database from the National Bureau of Statistics
Province / city / district - three levels in a single table
author: icefire
time: 2019-03-13
"""
import requests
from bs4 import BeautifulSoup
import os
def get_province(index_href):
    """Fetch the province list"""
    print('Fetching province list ...')
    province_url = url + index_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(province_url, headers=headers)
    request.encoding = 'gbk'
    province_html_text = str(request.text)
    soup = BeautifulSoup(province_html_text, "html.parser")
    province_tr_list = soup.select('.provincetr a')
    # Iterate over the province links
    level = '1'
    parent_code = ''
    for province_tr in province_tr_list:
        if province_tr:
            file = open('china_data/area.sql', 'a+', encoding='utf-8')
            province_href = province_tr.attrs['href']
            province_no = province_href.split('.')[0]
            province_code = province_no + '0000'
            province_name = province_tr.text
            province_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(province_code) + '", "' + str(province_name) + '", "' + str(parent_code) + '", "' + str(level) + '");\n'
            file.write(province_info)
            file.close()
            print('Wrote province:', province_info)
            # Cities
            get_city(province_href, province_code)
    print('Finished fetching provinces!')

def get_city(province_href, province_code):
    """Fetch the city-level list"""
    print('Fetching city-level info')
    city_url = url + province_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(city_url, headers=headers)
    request.encoding = 'gbk'
    city_html_text = str(request.text)
    soup = BeautifulSoup(city_html_text, "html.parser")
    city_tr_list = soup.select('.citytr')
    # Iterate over the city rows
    level = '2'
    for city_tr in city_tr_list:
        if city_tr:
            file = open('china_data/area.sql', 'a+', encoding='utf-8')
            city_a_info = city_tr.select('a')
            city_href = city_a_info[0].attrs['href']
            city_code = city_a_info[0].text[:6]
            city_name = city_a_info[1].text
            city_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(city_code) + '", "' + str(city_name) + '", "' + str(province_code) + '", "' + str(level) + '");\n'
            file.write(city_info)
            file.close()
            print('Wrote city:', city_info)
            # Districts
            get_area(city_href, city_code)
    print('Finished fetching cities!')

def get_area(city_href, city_code):
    """Fetch the district-level list"""
    print('Fetching district-level info')
    area_url = url + city_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(area_url, headers=headers)
    request.encoding = 'gbk'
    area_html_text = str(request.text)
    soup = BeautifulSoup(area_html_text, "html.parser")
    area_tr_list = soup.select('.countytr')
    # Iterate over the district rows
    file = open('china_data/area.sql', 'a+', encoding='utf-8')
    level = '3'
    for area_tr in area_tr_list:
        area_a_info = area_tr.select('td')
        if area_a_info:
            area_code = area_a_info[0].text[:6]
            area_name = area_a_info[1].text
            area_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(area_code) + '", "' + str(area_name) + '", "' + str(city_code) + '", "' + str(level) + '");\n'
            file.write(area_info)
            print('Wrote district:', area_info)
    print('Finished fetching districts!')
    file.close()

# Program entry point
if __name__ == "__main__":
    url = 'https://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/'
    # Create the output directory
    mysql_folder = 'china_data/'
    if not os.path.exists(mysql_folder):
        os.makedirs(mysql_folder)
    else:
        # Empty any previously generated file
        city_file = open('china_data/area.sql', 'w', encoding='utf-8')
        city_file.write('')
        city_file.close()
    get_province('index.html')
OK, done. As for South Africa's administrative divisions, I still need to think that over; I'll add it here once I find a solution.
Installing packages through the Douban mirror
pip install requests -i "https://pypi.doubanio.com/simple/"
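If you would rather not pass -i every time, the mirror can also be set as the default index (this is standard pip configuration, nothing specific to this project):
pip config set global.index-url https://pypi.doubanio.com/simple/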
Scraping NetEase Cloud Music song comments
The implementation took a few detours, so I'll just paste the code with inline comments. Besides requests, it needs a crypto library that provides Crypto.Cipher (pycryptodome, or the older pycrypto).
# v2.0
import base64
import codecs
import sys
import json
import os
import requests
import Crypto
from Crypto.Cipher import AES

class Spider():
    def __init__(self, idNum):
        # The User-Agent value can be copied straight from the browser; the other header
        # fields are optional (you can copy everything under Request Headers if you like).
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
                       'Referer': 'http://music.163.com/'}
        # Only the song id changes between requests, so the base URL can be built here.
        self.url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_' + idNum + '?csrf_token='

    def __get_jsons(self, url, page):
        # Build the two encrypted POST parameters
        music = WangYiYun()
        text = music.create_random_16()
        params = music.get_params(text, page)
        encSecKey = music.get_encSEcKey(text)
        fromdata = {'params': params, 'encSecKey': encSecKey}
        jsons = requests.post(url, data=fromdata, headers=self.header)
        # print(jsons.raise_for_status())
        # The response body is JSON
        # print(jsons.content)
        return jsons.text

    def json2list(self, jsons):
        '''Parse the JSON into a dict and collect the interesting fields into a list'''
        # json.loads() turns the response text into a dict
        # print(json.loads(jsons.text))
        users = json.loads(jsons)
        comments = []
        # print(users)
        for user in users['comments']:
            name = user['user']['nickname']
            content = user['content']
            times = user['time']
            userid = user['user']['userId']
            # Number of likes
            likedCount = user['likedCount']
            # Build a dict from the fields we want to keep
            user_dict = {'userid': userid, 'name': name, 'content': content, 'times': times}
            # Append it to the result list
            # print(user_dict)
            comments.append(user_dict)
            # comments.append(idNum)
        # print(comments)
        return comments

    def run(self, idNum):
        self.page = 1
        while True:
            jsons = self.__get_jsons(self.url, self.page)
            comments = self.json2list(jsons)
            non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
            ## print(str(comments[0]).translate(non_bmp_map))
            print('self.page = ' + str(self.page))  # page currently being fetched
            print(idNum)  # song id currently being fetched
            # Create a "comments" folder next to this script
            dirName = u'{}'.format('comments')
            if not os.path.exists(dirName):
                os.makedirs(dirName)
            # Append the results to a per-song csv file
            with open(os.path.join(dirName, idNum + ".csv"), "a", encoding='utf-8') as f:
                ## print(len(comments))
                for ii in range(len(comments)):
                    f.write(str(comments[ii]).translate(non_bmp_map))
                    f.write('\n')
                f.write(idNum)
                ## print(ii)
            # When a page returns fewer comments than the limit, we have reached the last page
            ## self.write2sql(comments)
            if len(comments) < 100:  # limit is set to 100 per request below, so fewer means the last page
                print('All comments fetched')
                break
            self.page += 1

# Produces the two POST parameters, params and encSecKey
class WangYiYun():
    def __init__(self):
        # Three constants taken from NetEase Cloud Music's web client
        self.second_param = '010001'
        self.third_param = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        self.fourth_param = '0CoJUm6Qyw8W8jud'

    def create_random_16(self):
        '''Build a random 16-character string'''
        return (''.join(map(lambda xx: (hex(ord(xx))[2:]), str(os.urandom(16)))))[0:16]

    def aesEncrypt(self, text, key):
        # Initialisation vector
        iv = '0102030405060708'
        # Pad the text to a multiple of 16 bytes
        pad = 16 - len(text) % 16
        text = text + pad * chr(pad)
        encryptor = AES.new(bytearray(key, 'utf-8'), AES.MODE_CBC, bytearray(iv, 'utf-8'))
        # encryptor = AES.new(key, 2, iv)
        ciphertext = encryptor.encrypt(bytearray(text, 'utf-8'))
        ## print(bytearray(key, 'utf-8'))
        ciphertext = base64.b64encode(ciphertext)
        return ciphertext

    def get_params(self, text, page):
        '''Build the first parameter (params)'''
        if page == 1:
            self.first_param = '{rid: "", offset: "0", total: "true", limit: "100", csrf_token: ""}'
            # rid: "R_SO_4_557581284" - tested: rid can be left empty without affecting the result.
        else:
            # limit defaults to 20 on the site; raising it to 100 speeds up the crawl,
            # and the offset has to advance by the same amount per page.
            self.first_param = '{rid: "", offset:%s, total: "false", limit: "100", csrf_token: ""}' % str((page - 1) * 100)
        params = self.aesEncrypt(self.first_param, self.fourth_param).decode('utf-8')
        params = self.aesEncrypt(params, text)
        return params

    def rsaEncrypt(self, pubKey, text, modulus):
        '''RSA-encrypt the random key'''
        text = text[::-1]
        rs = int(codecs.encode(text.encode('utf-8'), 'hex_codec'), 16) ** int(pubKey, 16) % int(modulus, 16)
        return format(rs, 'x').zfill(256)

    def get_encSEcKey(self, text):
        '''Build the second parameter (encSecKey)'''
        pubKey = self.second_param
        moudulus = self.third_param
        encSecKey = self.rsaEncrypt(pubKey, text, moudulus)
        return encSecKey

def main():
    # Song ids for 花粥's《纸短情长》 and Zedd / Jon Bellion's "Beautiful Now"; look up other
    # song ids on NetEase Cloud Music and add them here - the more ids, the more crawl loops.
    idPs = ['557581284', '32019002']
    for jj in range(len(idPs)):
        idNum = idPs[jj]
        spider = Spider(idNum)  # instantiate a Spider for this song
        spider.run(idNum)       # start crawling

if __name__ == '__main__':
    main()
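To add more songs, the numeric id can be read straight off a track's NetEase page URL (e.g. https://music.163.com/#/song?id=557581284). A tiny helper for that might look like the sketch below (song_id_from_url is my own name, not part of the script above):

import re

def song_id_from_url(url):
    """Extract the numeric song id from a NetEase Cloud Music track URL."""
    match = re.search(r'id=(\d+)', url)
    return match.group(1) if match else None

print(song_id_from_url('https://music.163.com/#/song?id=557581284'))  # prints 557581284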