Python Crawler Practice (8): Scraping the Maoyan Movie Top 100

Contents
  1. The request function
  2. The page-parsing function
  3. The image-saving function
  4. The movie-detail-saving function
  5. The main function
  6. Full source code

Scrape the Maoyan movie Top 100 ranking and save each movie's poster image.

Preface

Difficulty: ♦♦◊◊◊

Operating system: deepin 15.11

Software: PyCharm

Goal: scrape the Maoyan movie Top 100

Disclaimer: this article is for research and learning purposes only.

The code is written as plain functions, without class encapsulation.

Libraries used

import csv
import json
import requests
from requests.exceptions import RequestException
import re
from lxml import etree
import os


The request function:

This uses the exception classes from the requests library: if the request raises, or the status code is not 200, the function returns None.
def get_one_page(url, headers):  # send a GET request for one page
    try:
        response = requests.get(url, headers=headers)
        # print(response.encoding)
        print("******")  # printed once per page
        if response.status_code == 200:
            return response.content.decode()
        return None
    except RequestException:  # catch any requests exception
        return None
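
For reference, the return value is either the decoded UTF-8 text of the page or None, so callers should check for None before parsing. A quick illustrative check, using the headers defined in the main function below:

html = get_one_page("https://maoyan.com/board/4?offset=0", headers)
if html is None:
    print("request failed or returned a non-200 status")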


The page-parsing function:

A regular expression, written against the page's HTML source, extracts the data.

def parse_one_page(html):  # parse the page content
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d*)</i>.*?data-src="(.*?)".*?name"><a' +
                         r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)

    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'author': item[3].strip()[3:],  # drop the "主演:" prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix
            'score': item[5] + item[6],
        }
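
To make the pattern concrete, here is a hand-written snippet that mirrors the shape of the board page's markup, together with the record the generator yields for it (the snippet is illustrative, not copied from the live site):

sample = '''<dd>
    <i class="board-index board-index-1">1</i>
    <img data-src="https://p0.meituan.net/movie/demo.jpg" class="board-img"/>
    <p class="name"><a href="/films/1203">霸王别姬</a></p>
    <p class="star">主演:张国荣,张丰毅,巩俐</p>
    <p class="releasetime">上映时间:1993-01-01</p>
    <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</dd>'''
print(next(parse_one_page(sample)))
# {'index': '1', 'image': 'https://p0.meituan.net/movie/demo.jpg', 'title': '霸王别姬',
#  'author': '张国荣,张丰毅,巩俐', 'time': '1993-01-01', 'score': '9.5'}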

The image-saving function:

XPath is used here to extract all the poster image URLs on one page. Note that the for loop iterates over two variables, so the two sequences must be paired with zip(); otherwise the loop raises an error. The function also checks whether a top100_maoyan_img folder exists in the current directory, and creates it if not.
def save_img(html, j):  # save the poster images of one page
    tree = etree.HTML(html)
    img = tree.xpath('//img[@class="board-img"]/@data-src')
    for item, i in zip(img, range(j + 1, j + 11)):
        response = requests.get(item)
        dir_name = 'top100_maoyan_img'
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        poster_path = dir_name + '/' + str(i) + '.jpg'
        with open(poster_path, "wb") as f:
            f.write(response.content)
            print("Saving image %d" % i)  # report progress per image

The movie-detail-saving function:

Saves the data to a CSV file.

def save_results(result):  # append one movie's details to the CSV file
    # newline='' avoids blank rows on Windows; utf-8 keeps Chinese titles intact
    with open('sgyz_maoyan.csv', 'a', newline='', encoding='utf-8') as fp:
        writer = csv.DictWriter(fp, fieldnames=['index', 'title', 'score', 'author', 'time', 'image'])
        writer.writerow(result)
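
Because DictWriter already knows the field names, the header row could also come from writeheader() rather than the hand-built dict the main program writes below. A sketch, run once before scraping:

with open('sgyz_maoyan.csv', 'w', newline='', encoding='utf-8') as fp:
    writer = csv.DictWriter(fp, fieldnames=['index', 'title', 'score', 'author', 'time', 'image'])
    writer.writeheader()  # writes the row "index,title,score,author,time,image"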

The main function:

Note the request headers: in my tests, sending only the Cookie and User-Agent returned no data, so a few more header fields are needed.
def main(i, num):  # main function: handle one page of the ranking
    headers = {  # request headers
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Cookie': '__mta=213153707.1584169820671.1584176850800.1584176888803.20; uuid_n_v=v1; uuid=DCA0118065C211EABFD4D144263730BB3384A5D996F4486FBD44DA930515B024; _csrf=486e9298701b5fcff171b29ffe8e7f8c022d3f3f6ccb5acca09b7b7bfbd8b34a; _lxsdk_cuid=170d7e11ea7c8-07bfb3c85ab355-6313f69-144000-170d7e11ea7c8; mojo-uuid=04a41724b07cf2e52447711156b8dea1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1584169820,1584169847; mojo-session-id={"id":"87c2c9fdc02b4d0f3196f1c7a8726afe","time":1584174922356}; _lxsdk=DCA0118065C211EABFD4D144263730BB3384A5D996F4486FBD44DA930515B024; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1584177991; __mta=213153707.1584169820671.1584176888803.1584177990939.21; mojo-trace-id=20; _lxsdk_s=170d8277745-ba6-3a2-65%7C%7C79',
        'Host': 'maoyan.com',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1'
    }

    url = "https://maoyan.com/board/4?offset=" + str(i)
    html = get_one_page(url, headers)
    save_img(html, num)

    for item in parse_one_page(html):
        save_results(item)

    num += 10
    return num

Full source code:

# ------------------
# coding=utf-8
# @author: lcx
# scrape the Maoyan movie Top 100
# @software: PyCharm
# -------------------
import csv
import json
import requests
from requests.exceptions import RequestException
import re
from lxml import etree
import os


def get_one_page(url, headers):  # send a GET request for one page
    try:
        response = requests.get(url, headers=headers)
        # print(response.encoding)
        print("******")  # printed once per page
        if response.status_code == 200:
            return response.content.decode()
        return None
    except RequestException:  # catch any requests exception
        return None


def parse_one_page(html):  # parse the page content
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d*)</i>.*?data-src="(.*?)".*?name"><a' +
                         r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)

    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'author': item[3].strip()[3:],  # drop the "主演:" prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix
            'score': item[5] + item[6],
        }


def save_results(result):  # append one movie's details to the CSV file
    with open('sgyz_maoyan.csv', 'a', newline='', encoding='utf-8') as fp:
        writer = csv.DictWriter(fp, fieldnames=['index', 'title', 'score', 'author', 'time', 'image'])
        writer.writerow(result)


def save_img(html, j):  # save the poster images of one page
    tree = etree.HTML(html)
    img = tree.xpath('//img[@class="board-img"]/@data-src')
    for item, i in zip(img, range(j + 1, j + 11)):
        response = requests.get(item)
        dir_name = 'top100_maoyan_img'
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        poster_path = dir_name + '/' + str(i) + '.jpg'
        with open(poster_path, "wb") as f:
            f.write(response.content)
            print("Saving image %d" % i)  # report progress per image


def main(i, num):  # main function: handle one page of the ranking
    headers = {  # request headers
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Cookie': '__mta=213153707.1584169820671.1584176850800.1584176888803.20; uuid_n_v=v1; uuid=DCA0118065C211EABFD4D144263730BB3384A5D996F4486FBD44DA930515B024; _csrf=486e9298701b5fcff171b29ffe8e7f8c022d3f3f6ccb5acca09b7b7bfbd8b34a; _lxsdk_cuid=170d7e11ea7c8-07bfb3c85ab355-6313f69-144000-170d7e11ea7c8; mojo-uuid=04a41724b07cf2e52447711156b8dea1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1584169820,1584169847; mojo-session-id={"id":"87c2c9fdc02b4d0f3196f1c7a8726afe","time":1584174922356}; _lxsdk=DCA0118065C211EABFD4D144263730BB3384A5D996F4486FBD44DA930515B024; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1584177991; __mta=213153707.1584169820671.1584176888803.1584177990939.21; mojo-trace-id=20; _lxsdk_s=170d8277745-ba6-3a2-65%7C%7C79',
        'Host': 'maoyan.com',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1'
    }

    url = "https://maoyan.com/board/4?offset=" + str(i)
    html = get_one_page(url, headers)
    save_img(html, num)

    for item in parse_one_page(html):
        save_results(item)

    num += 10
    return num


if __name__ == '__main__':
    save_results({'index': 'rank', 'title': 'name', 'score': 'score',
                  'author': 'starring', 'time': 'release date', 'image': 'poster URL'})
    num = 0
    for i in range(0, 100, 10):
        num = main(i, num)

Execution results:

The scraped data (screenshot):

The poster images (screenshot):

Of course, the crawl could also be pushed into threads to run faster; a sketch of that idea follows.
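
A minimal sketch with concurrent.futures, assuming the functions above are available at module level and that headers has been hoisted out of main; the lock keeps concurrent appends to the shared CSV file from interleaving:

import threading
from concurrent.futures import ThreadPoolExecutor

csv_lock = threading.Lock()

def crawl_page(offset):  # one Top-100 page per task
    html = get_one_page("https://maoyan.com/board/4?offset=" + str(offset), headers)
    if html is None:
        return
    save_img(html, offset)  # the page offset doubles as the image-number base
    for item in parse_one_page(html):
        with csv_lock:  # serialize writes to the shared CSV
            save_results(item)

with ThreadPoolExecutor(max_workers=5) as pool:
    pool.map(crawl_page, range(0, 100, 10))  # offsets 0, 10, ..., 90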