Fork me on GitHub

Python爬取糗事百科段子

话不多说,直接上干货。

import urllib.request as request
from bs4 import BeautifulSoup
import threading
# Number of pages to scrape, read interactively when the script loads.
# (Typo fix in the prompt: "需需要爬去" -> "需要爬取".)
page = int(input("请输入需要爬取的页数:"))
# Shared accumulator: every joke's text from all scraped pages.
restlus = []
def getHTML(url):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser User-Agent is sent because the site rejects the
    default urllib agent.
    """
    # Bug fix: the header VALUE previously began with the literal text
    # "User-Agent:", duplicating the header name inside its value.
    headers = {'User-Agent':
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
               'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    req = request.Request(url, headers=headers)
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the response object).
    with request.urlopen(req) as resp:
        return resp.read()
def get_qiubai_results(url):
    """Parse one listing page and append each joke's text to ``restlus``.

    Returns the shared ``restlus`` list (all jokes collected so far).
    """
    soup = BeautifulSoup(getHTML(url), 'lxml')
    contents = soup.find_all('div', {'class': 'content'})
    for item in contents:
        # Renamed local from ``str`` — it shadowed the builtin.
        # NOTE(review): getText's second positional arg is ``strip``;
        # passing '<br/>' merely makes it truthy — confirm intent.
        text = item.find('span').getText('\n', '<br/>')
        restlus.append(text)
    return restlus
def writeData():
    """Write all collected jokes to test.txt, separated by blank lines."""
    # Explicit utf-8 so Chinese text is written correctly regardless of
    # the platform default; ``with`` guarantees the file is closed even
    # if the write raises (the original leaked on error).
    with open('test.txt', 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(restlus))
    return
def main():
    """Scrape ``page`` listing pages of text jokes, then dump to test.txt."""
    # Bug fix: range(1, page) stopped one page early — the user who asked
    # for N pages only got N-1. range(1, page + 1) covers pages 1..N.
    for page_no in range(1, page + 1):
        # Text-only listing; the commented URL below is the hot-image feed.
        url = 'https://www.qiushibaike.com/text/page/%d/?s=4989915' % page_no
        # url = 'http://www.qiushibaike.com/8hr/page/%d/?s=4952526' % page_no
        # The original iterated the results only to ``pass``; the call
        # alone is enough since it appends into the shared list.
        get_qiubai_results(url)
    writeData()
    return
if __name__ == '__main__':
    # Kick off the scrape on a background thread (a single worker —
    # the pages are fetched sequentially inside main()).
    worker = threading.Thread(target=main)
    worker.start()