python3爬虫学习(6)

文章目录
  1. 0.1:CSV模块使用
  2. 0.2:百度翻译例子
  3. 0.3:爬取段子网的段子
  4. 0.4:大麦网演唱会

通用爬虫实练

前言

声明:本篇只是小生的学习记录,文章仅供研究交流使用,请勿用作其他用途。

CSV模块使用

总结之前,先来说一下CSV模块。CSV本质上也是文本文件。使用之前先要导入该模块

#看一下读取文件操作
import csv
# Read the whole CSV into a list of dicts (one dict per row, keyed by header).
with open('result.csv', encoding='utf-8') as f:
    reader = list(csv.DictReader(f))

 

下面是文件的写操作,其实和python的文件操作有点像

writer = csv.DictWriter(f, fieldnames=['', '', ''])  # the second argument can also be a list of column names defined beforehand
writer.writeheader()# write the CSV header row (the column names)
writer.writerows()# write an entire list of dicts to the CSV file (illustrative: a list of row dicts goes here)

writer.writerow()# write a single row (illustrative: one row dict goes here)
这里需要注意的是字典的key必须和fieldnames相同,否则报错

百度翻译例子


 

import requests
import execjs # 执行js代码
import json
import sys

headers = {  # request headers
"accept": "*/*",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"content-length": "122",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"cookie": "", # cookie string goes here
"origin": "https://fanyi.baidu.com",
"referer": "https://fanyi.baidu.com/",
"user-agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36",
"x-requested-with": "XMLHttpRequest"
}
# query = sys.argv[1]
query = str(input("请输入一个英语:"))

# Key step: run Baidu Translate's own JS to compute the anti-scraping `sign`
# value for this query — requests without a valid sign get no usable response.
with open('baidufanyi_sign.js', 'r', encoding='utf-8') as f:
    ctx = execjs.compile(f.read())
sign = ctx.call('e', query)

post_data = {  # form data sent with the POST request
"query": query,
"from": "en",
"to": "zh",
"token": "38442359a447d90721cd18e04d8a7f17",
"sign": sign,
# "simple_means_flag": "3",
}
# Translate endpoint. NOTE(review): the original URL string was garbled by HTML
# extraction (it mixed "basetrans" and "v2transapi"); restored to v2transapi,
# which matches the token+sign form data — confirm against the live site.
post_url = "https://fanyi.baidu.com/v2transapi"
r = requests.post(post_url, data=post_data, headers=headers)

dict_ret = json.loads(r.content.decode(), strict=False) # json.loads turns the JSON response into a Python dict

ret = dict_ret["trans_result"]['data'][0]['dst']
print("翻译结果:",ret)

这里面最重要的就是用百度的js函数计算sign的值,因为如果没有该值就无法获得响应.(用到的js文件在本文末尾会贴出)

还有就是上面的代码没有使用面向对象,并且只能翻译英文到中文,如果要判别语言,可以再写一个函数用来获取语言并且传送给data字典

看一下执行效果

爬取段子网的段子

这里用的是面向对象写的

 

# coding=utf-8

import requests
import re
import json

class DuanZi:
    """Scrape jokes from duanziwang.com page by page and save them to a text file.

    The original posting lost all indentation during HTML extraction; the
    structure below is restored from the numbered comments in run().
    """

    def __init__(self):
        self.start_url = "https://duanziwang.com/"  # site root (kept for reference; run() uses url_temp)
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"}
        self.url_temp = "https://duanziwang.com/page/{}/"  # per-page URL template

    def parse_url(self, url):
        """Send a GET request to *url* and return the decoded response body."""
        response = requests.get(url, headers=self.headers)
        print(url)
        return response.content.decode()

    def get_first_page_content_list(self, html_str):
        """Extract every <p>...</p> body from the page HTML (re.S spans newlines)."""
        content_list = re.findall(r"<p>(.*?)</p>", html_str, re.S)
        return content_list

    def save_content_list(self, content_list):
        """Append each extracted joke to duzni.txt as a JSON string, one per line."""
        # encoding added: the platform default (e.g. GBK on Windows) can fail
        # on Chinese text; ensure_ascii=False keeps the text human-readable.
        with open("duzni.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("保存成功")

    def run(self):
        """Main logic: prompt for a page count, then fetch/parse/save each page."""
        url_page = int(input("请输入要爬取的页数:"))
        num = 1
        while True:
            # 1. build the page URL and fetch it
            html_str = self.parse_url(self.url_temp.format(num))
            # 2. extract the jokes
            content_list = self.get_first_page_content_list(html_str)
            # 3. save them
            self.save_content_list(content_list)
            # 4. stop once the requested page count is reached
            if url_page == num:
                break
            num += 1

if __name__ == '__main__':
    # Entry point: build the scraper and start the interactive loop.
    spider = DuanZi()
    spider.run()

运行程序

 

保存的TXT文件

大麦网演唱会


 

主要源码实现

import requests
import csv
import json

headers = {
"user-agent": "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Mobile Safari/537.36"}
# Damai search API: concerts (演唱会), 30 per page, page 2.
url = "https://search.damai.cn/searchajax.html?keyword=&cty=&ctl=%E6%BC%94%E5%94%B1%E4%BC%9A&sctl=&tsg=0&st=&et=&order=1&pageSize=30&currPage=2&tn="
source = requests.get(url, headers=headers).content.decode()

dict_data = json.loads(source)

# print(dict_data)

# The concert records live under pageData.resultData.
need_spider_data = dict_data["pageData"]["resultData"]
print(need_spider_data)

# Build the CSV header from the first record's keys.
# (list(...) replaces the original manual append loop; the variable is no
# longer named `list`, which shadowed the builtin.)
fieldnames = list(need_spider_data[0])

# Some later rows carry an extra 'favourable' key that the first row lacks;
# csv.DictWriter raises ValueError on keys missing from fieldnames, so the
# original author appended it to the header — kept here.
fieldnames.append('favourable')

with open("damaiwang" + ".csv", "w", newline="", encoding='utf8') as f:
    # First row is the header, then every record dict becomes one CSV row.
    writer = csv.DictWriter(f, fieldnames)
    writer.writeheader()
    writer.writerows(need_spider_data)

运行程序后生成的CSV文件

 

上面用到的js代码

// Core mixing step of the sign algorithm: fold `r` through a sequence of
// 3-character opcodes in `o`. For each group:
//   char[2] = shift amount ('a'.. maps to 10.., digits used as-is),
//   char[1] = shift direction ('+' => unsigned right shift, else left shift),
//   char[0] = combine op ('+' => 32-bit wrapping add, else XOR).
function n(r, o) {
for (var t = 0; t < o.length - 2; t += 3) {
var a = o.charAt(t + 2);
a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
a = "+" === o.charAt(t + 1) ? r >>> a : r << a,
r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
}
return r
}

// Hard-coded gtk value (normally read from window.gtk on the translate page).
var i = "320305.131321201"
// Compute the `sign` form field for query string `r`.
function e(r) {
// Detect astral characters (surrogate pairs) — they change how the
// 30-character truncation below must count.
var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === o) {
var t = r.length;
// Long input: keep only first 10 + middle 10 + last 10 characters.
t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
} else {
// Split around surrogate pairs and rebuild a character array.
// NOTE(review): `a` is a spread/array helper defined elsewhere in the
// original page script — not included in this excerpt.
for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
C !== h - 1 && f.push(o[C]);
var g = f.length;
g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
}
var u = void 0
// l spells "gtk"; obfuscated via fromCharCode.
, l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
// Prefer the hard-coded gtk above; fall back to window.gtk when running in a page.
u = null !== i ? i : (i = window[l] || "") || "";
// gtk is "m.s"; m seeds the hash, s is XORed in at the end.
// The loop below UTF-8 encodes r into byte array S.
for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
var A = r.charCodeAt(v);
128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
S[c++] = A >> 18 | 240,
S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
S[c++] = A >> 6 & 63 | 128),
S[c++] = 63 & A | 128)
}
// F and D are obfuscated opcode strings ("+-a^+6" and "+-3^+b+-f") fed to n();
// fold every byte through n(), then finish with D, XOR in s, clamp to
// non-negative 32-bit, and take modulo 1e6. Result format: "NNNNNN.XXXX".
for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
p += S[b],
p = n(p, F);
return p = n(p, D),
p ^= s,
0 > p && (p = (2147483647 & p) + 2147483648),
p %= 1e6,
p.toString() + "." + (p ^ m)
}

 

 

 

 

 

点赞

发表评论

昵称和uid可以选填一个,填邮箱必填(留言回复后将会发邮件给你)
tips:输入uid可以快速获得你的昵称和头像

Title - Artist
0:00