Python例子

Python例子-爬取糗事百科首页内容

   

import urllib.request
import re
import json
# Patterns are compiled once at import time instead of once per joke entry.
# re.S lets "." match newlines, because each entry spans several lines of HTML.
_JOKE_BLOCK_RE = re.compile(
    r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">',
    re.S,
)
_USERNAME_RE = re.compile(r'<h2>(.*?)</h2>', re.S)
_CONTENT_RE = re.compile(r'<div class="content">\n<span>(.*?)</span>', re.S)


def _parse_jokes(html):
    """Return a ``{username: joke_text}`` dict extracted from page HTML."""
    jokes = {}
    for block in _JOKE_BLOCK_RE.findall(html):
        user_match = _USERNAME_RE.search(block)
        content_match = _CONTENT_RE.search(block)
        # An entry missing either piece cannot be stored; skip it rather
        # than abort the whole crawl with an IndexError.
        if user_match is None or content_match is None:
            continue
        username = user_match.group(1).replace('\n', '')
        content = content_match.group(1).replace('\n', '')
        # A later entry by the same username overwrites the earlier one.
        jokes[username] = content
    return jokes


def jokeCrawler(url, path='/home/python/Desktop/pc/test/qiushi.json'):
    """Fetch a joke listing page, extract the jokes and save them as JSON.

    Args:
        url: Address of the page to download. The body is decoded as UTF-8.
        path: File the resulting dict is written to as JSON. Defaults to
            the location the script originally hard-coded.

    Returns:
        A dict mapping each author's username to the text of their joke,
        with newlines removed from both. Entries in which no username or
        no joke text can be found are skipped.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
        OSError: If ``path`` cannot be opened for writing.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    }
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if reading/decoding fails.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')

    dic = _parse_jokes(html)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(dic))
    return dic

# Script driver: crawl page 1 of the text section, then print the number
# of jokes collected followed by the full username -> joke mapping.
url = 'https://www.qiushibaike.com/text/page/1/'
info = jokeCrawler(url)

print(len(info), info, sep='\n')


最后修改:2020年3月29日 22:35