python爬虫练习(9)--爬取今日头条街拍美图

文章目录
  1. 0.1:下面看一下具体的函数实现
  2. 0.2:源代码

获取今日头条街拍美图并保存,将信息存储到MongoDB中

前言

目标:爬取今日头条街拍美图

难度:♦♦◊◊◊

系统:deepin 15.11

软件:pycharm,MongoDB

解释器:Python 3.5

没有采用面向对象编程

声明:本文仅供研究交流使用

用到的一些模块

import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
import json
from bs4 import BeautifulSoup
import re
from pymongo import MongoClient
import os
from hashlib import md5
from multiprocessing import Pool
from json.decoder import JSONDecodeError

 

难点就是,网站信息是基于异步加载实现的,所以应该先找到保存图片信息的位置。还有就是爬取十几页之后,可能触发反爬机制,可以用多个代理(代理池)解决这个问题。我们随便点开一个,右击网页检查一下,然后选中 doc,查看原始请求,发现图片信息是在一个 json 字符串中的。

下面看一下具体的函数实现

请求索引页

这里就是带请求头的时候,多带几个参数,否则无法返回数据。请求信息的话,有些锚点可以去掉
def get_page_index(offset, kwword):
    """Request one page of search results; return the raw JSON text, or None."""
    # Extra headers (referer / x-requested-with / cookie) are required or the
    # API returns no data.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36",
        "referer": "https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D",
        "x-requested-with": "XMLHttpRequest",
        "cookie": "tt_webid=6804442003394299400; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6804442003394299400; csrftoken=2f4ad337b4c54533911c4fe6eb8add44; ttcid=286904df2ce44a248f08924ea7b1173021; _ga=GA1.2.841103549.1584282704; _gid=GA1.2.651054703.1584282704; s_v_web_id=verify_k7u3dat2_b2Wcvr6o_4QXO_4Ily_9Wrg_9b6fNn1VGYTW; __tasessionId=83485wdw71584340404738; SLARDAR_WEB_ID=13574044-ee9e-41f0-b1ea-2e73b94d3e0d; tt_scid=oRoddyXUCd-Qcay2..gKngwFK8yi-JtHz-a-8ER1NTVpoxH9yG89DkZwrhWArjwI5e87"
    }
    # Query-string parameters for the search API.
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': kwword,
    }
    index_url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        resp = requests.get(index_url, headers=headers)
    except RequestException:
        print("请求索引页出错!!")
        return None
    return resp.text if resp.status_code == 200 else None

解析索引页

这里直接将json字符串转化为Python字符串,然后提取页面URL
def parse_page_index(html):
    """Yield every article_url found in the index-page JSON string."""
    try:
        payload = json.loads(html)
    except JSONDecodeError:
        # Malformed JSON (e.g. anti-crawler response): yield nothing.
        return
    if payload and 'data' in payload.keys():
        for entry in payload.get('data'):
            yield entry.get('article_url')

请求文章页

这里就是需要将索引页解析的URL传入请求
def get_page_detail(url):
    """Request the article page at *url*; return the decoded HTML, or None."""
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36",
        "cookie": "tt_webid=6804442003394299400; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6804442003394299400; csrftoken=2f4ad337b4c54533911c4fe6eb8add44; ttcid=286904df2ce44a248f08924ea7b1173021; _ga=GA1.2.841103549.1584282704; _gid=GA1.2.651054703.1584282704; s_v_web_id=verify_k7u3dat2_b2Wcvr6o_4QXO_4Ily_9Wrg_9b6fNn1VGYTW; __tasessionId=83485wdw71584340404738; SLARDAR_WEB_ID=13574044-ee9e-41f0-b1ea-2e73b94d3e0d; tt_scid=pU3.TWmXfq.abCdugMNjEhWWDV73ZDzkStpEgXSLq0z7zz2SAjGOtM71Wc7ApoLxe424"
    }
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print("请求详情页出错", url)
        return None
    return resp.content.decode() if resp.status_code == 200 else None

下载保存图片

这里我们用了两个函数

 

def download_img(url):
    """Fetch the image at *url* and hand the raw bytes to save_images."""
    print("正在下载", url)
    try:
        resp = requests.get(url)
        if resp.status_code == 200:
            save_images(resp.content)
        return None
    except RequestException:
        print("请求图片出错!!", url)
        return None

def save_images(content):
    """Write image bytes to toutiao_image/<md5-of-content>.jpg.

    The md5 digest is used as the filename so duplicate downloads
    naturally overwrite each other.
    """
    file_path = '{0}/{1}.{2}'.format('toutiao_image', md5(content).hexdigest(), 'jpg')
    # os.makedirs is portable and race-safe, unlike shelling out to `mkdir`.
    os.makedirs('toutiao_image', exist_ok=True)
    # `with` already closes the file; the explicit f.close() was redundant.
    with open(file_path, 'wb') as f:
        f.write(content)

 

解析提取图片地址

这里是一个难点,这里的提取用了bs4语法(当然一般是不推荐采取这种方式的,一般用xpath,这里我只是练习),和正则表达式。

这里我们必须将\\用空替换,否则会报错。data = json.loads(result.group(1).replace('\\', ''))。还有就是这里我解析得到的网页地址是有点问题的,需要在后面替换一下,item.get('url').replace('u002F', '/')。

 

def parse_page_detail(html, url):
    """Extract gallery image URLs from an article page.

    Downloads each image as a side effect, then returns a dict with
    'title', 'url' and 'images'; returns None when no gallery is found.
    """
    soup = BeautifulSoup(html, 'lxml')
    # BUG FIX: `title` was referenced in the returned dict but never defined
    # (NameError at runtime). Take it from the page's <title> element.
    title_nodes = soup.select('title')
    title = title_nodes[0].get_text() if title_nodes else ''
    # Raw string so the regex escapes are not Python string escapes.
    pattern = re.compile(r'gallery: JSON.parse\(\"(.*?)\"\),', re.S)
    result = re.search(pattern, html)
    if result:
        # Strip the escaping backslashes, otherwise json.loads fails.
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            # After un-escaping, URLs contain 'u002F' where '/' should be.
            images = [item.get('url').replace('u002F', '/') for item in sub_images]
            for image in images:
                download_img(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }

保存到MongoDB

# Shared MongoDB handle.
# NOTE(review): connect=False defers the actual connection; presumably chosen
# so the client works across the multiprocessing fork in main — confirm.
client = MongoClient(host="127.0.0.1", port=27017, connect=False)
db = client["jinritoutiao"]["toutiao"]


def save_to_mongo(result):
    """Insert one crawl result into MongoDB; return True on success."""
    if result:
        # Collection.insert() is deprecated (removed in pymongo 4);
        # insert_one() is the supported equivalent for a single document.
        db.insert_one(result)
        print("存储到mongoDB成功!!")
        return True
    return False

主函数

这里我们可以采用多进程提高效率
def main(offset):
    """Crawl one result page: index -> article pages -> galleries -> MongoDB."""
    index_html = get_page_index(offset, '街拍')
    for article_url in parse_page_index(index_html):
        detail_html = get_page_detail(article_url)
        if not detail_html:
            continue
        record = parse_page_detail(detail_html, article_url)
        if record:
            save_to_mongo(record)

源代码

 

# ------------
# coding=utf-8
# Author:lcx
# software:pycharm
# ------------
import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
import json
from bs4 import BeautifulSoup
import re
from pymongo import MongoClient
import os
from hashlib import md5
from multiprocessing import Pool
from json.decoder import JSONDecodeError

# Shared MongoDB handle used by save_to_mongo.
# NOTE(review): connect=False defers the actual connection; presumably chosen
# so the client works across the multiprocessing fork in __main__ — confirm.
client = MongoClient(host="127.0.0.1", port=27017, connect=False)
db = client["jinritoutiao"]["toutiao"]

def get_page_index(offset, kwword):
    """Request one page of search results; return the raw JSON text, or None."""
    # Extra headers (referer / x-requested-with / cookie) are required or the
    # API returns no data.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36",
        "referer": "https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D",
        "x-requested-with": "XMLHttpRequest",
        "cookie": "tt_webid=6804442003394299400; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6804442003394299400; csrftoken=2f4ad337b4c54533911c4fe6eb8add44; ttcid=286904df2ce44a248f08924ea7b1173021; _ga=GA1.2.841103549.1584282704; _gid=GA1.2.651054703.1584282704; s_v_web_id=verify_k7u3dat2_b2Wcvr6o_4QXO_4Ily_9Wrg_9b6fNn1VGYTW; __tasessionId=83485wdw71584340404738; SLARDAR_WEB_ID=13574044-ee9e-41f0-b1ea-2e73b94d3e0d; tt_scid=oRoddyXUCd-Qcay2..gKngwFK8yi-JtHz-a-8ER1NTVpoxH9yG89DkZwrhWArjwI5e87"
    }
    # Query-string parameters for the search API.
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': kwword,
    }
    index_url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        resp = requests.get(index_url, headers=headers)
    except RequestException:
        print("请求索引页出错!!")
        return None
    return resp.text if resp.status_code == 200 else None

def parse_page_index(html):
    """Yield every article_url found in the index-page JSON string."""
    try:
        payload = json.loads(html)
    except JSONDecodeError:
        # Malformed JSON (e.g. anti-crawler response): yield nothing.
        return
    if payload and 'data' in payload.keys():
        for entry in payload.get('data'):
            yield entry.get('article_url')

def get_page_detail(url):
    """Request the article page at *url*; return the decoded HTML, or None."""
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36",
        "cookie": "tt_webid=6804442003394299400; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6804442003394299400; csrftoken=2f4ad337b4c54533911c4fe6eb8add44; ttcid=286904df2ce44a248f08924ea7b1173021; _ga=GA1.2.841103549.1584282704; _gid=GA1.2.651054703.1584282704; s_v_web_id=verify_k7u3dat2_b2Wcvr6o_4QXO_4Ily_9Wrg_9b6fNn1VGYTW; __tasessionId=83485wdw71584340404738; SLARDAR_WEB_ID=13574044-ee9e-41f0-b1ea-2e73b94d3e0d; tt_scid=pU3.TWmXfq.abCdugMNjEhWWDV73ZDzkStpEgXSLq0z7zz2SAjGOtM71Wc7ApoLxe424"
    }
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print("请求详情页出错", url)
        return None
    return resp.content.decode() if resp.status_code == 200 else None

def parse_page_detail(html, url):
    """Extract gallery image URLs from an article page.

    Downloads each image as a side effect, then returns a dict with
    'title', 'url' and 'images'; returns None when no gallery is found.
    """
    soup = BeautifulSoup(html, 'lxml')
    # BUG FIX: `title` was referenced in the returned dict but never defined
    # (NameError at runtime). Take it from the page's <title> element.
    title_nodes = soup.select('title')
    title = title_nodes[0].get_text() if title_nodes else ''
    # Raw string so the regex escapes are not Python string escapes.
    pattern = re.compile(r'gallery: JSON.parse\(\"(.*?)\"\),', re.S)
    result = re.search(pattern, html)
    if result:
        # Strip the escaping backslashes, otherwise json.loads fails.
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            # After un-escaping, URLs contain 'u002F' where '/' should be.
            images = [item.get('url').replace('u002F', '/') for item in sub_images]
            for image in images:
                download_img(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }

def save_to_mongo(result):
    """Insert one crawl result into MongoDB; return True on success."""
    if result:
        # Collection.insert() is deprecated (removed in pymongo 4);
        # insert_one() is the supported equivalent for a single document.
        db.insert_one(result)
        print("存储到mongoDB成功!!")
        return True
    return False

def download_img(url):
    """Fetch the image at *url* and hand the raw bytes to save_images."""
    print("正在下载", url)
    try:
        resp = requests.get(url)
        if resp.status_code == 200:
            save_images(resp.content)
        return None
    except RequestException:
        print("请求图片出错!!", url)
        return None

def save_images(content):
    """Write image bytes to toutiao_image/<md5-of-content>.jpg.

    The md5 digest is used as the filename so duplicate downloads
    naturally overwrite each other.
    """
    file_path = '{0}/{1}.{2}'.format('toutiao_image', md5(content).hexdigest(), 'jpg')
    # os.makedirs is portable and race-safe, unlike shelling out to `mkdir`.
    os.makedirs('toutiao_image', exist_ok=True)
    # `with` already closes the file; the explicit f.close() was redundant.
    with open(file_path, 'wb') as f:
        f.write(content)

def main(offset):
    """Crawl one result page: index -> article pages -> galleries -> MongoDB."""
    index_html = get_page_index(offset, '街拍')
    for article_url in parse_page_index(index_html):
        detail_html = get_page_detail(article_url)
        if not detail_html:
            continue
        record = parse_page_detail(detail_html, article_url)
        if record:
            save_to_mongo(record)

if __name__ == '__main__':
    # Offsets 420..800 in steps of 20 (result pages 21-40).
    offsets = list(range(21 * 20, 41 * 20, 20))
    worker_pool = Pool()
    # Fan the pages out across worker processes.
    worker_pool.map(main, offsets)

运行结果:

保存到MongoDB的数据

保存的图片

当然这里面还有很多的问题需要改进~
点赞

发表评论

昵称和uid可以选填一个,填邮箱必填(留言回复后将会发邮件给你)
tips:输入uid可以快速获得你的昵称和头像

Title - Artist
0:00