A Word Frequency Table Built by Scraping The New York Times with Python

Over the past couple of days I got the itch to scrape some news with Python and see what the word frequencies look like. The whole process taught me quite a bit more about Python, which was exciting.

import io
import re
import sys

import requests
from bs4 import BeautifulSoup

# Force UTF-8 output so printing works on consoles with a different default encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

word_map = {}     # word -> occurrence count
total_words = 0   # total number of words seen

# Optional local proxy left over from testing; the empty dict below disables it.
# proxy = {'https': 'http://localhost:1081'}
proxy = {}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}


def analyze(text):
    # Count every run of two or more ASCII letters as a word, case-insensitively.
    global total_words
    words = re.findall(r'[a-zA-Z]{2,}', text)
    total_words += len(words)
    for w in words:
        word = w.lower()
        word_map[word] = word_map.get(word, 0) + 1


def write_file():
    # Sort by count, highest first, and dump the table to word.txt.
    sorted_words = sorted(word_map.items(), key=lambda item: item[1], reverse=True)
    print(sorted_words)
    print('all:', total_words)
    print('map:', len(sorted_words))
    with open('./word.txt', 'w') as f:
        for k, v in sorted_words:
            f.write(k + '\t' + str(v) + '\n')
        f.write('all:' + str(total_words) + '\n')
    print('end')


def get_nynews():
    # Collect the section pages linked from the New York Times homepage.
    home = 'https://www.nytimes.com'
    resp = requests.get(home, headers=headers, proxies=proxy)
    section_urls = set()
    for m in re.finditer(r'href="(https://www.nytimes.com/section/\w+)', resp.text):
        section_urls.add(m.group(1))

    for section_url in section_urls:
        resp = requests.get(section_url, headers=headers, proxies=proxy)
        # Article links on a section page look like /2020/.../something.html
        article_urls = set()
        for m in re.finditer(r'href="(/\d{4}.+?html)', resp.text):
            article_urls.add('https://www.nytimes.com' + m.group(1))

        for article_url in article_urls:
            print('get :', article_url)
            r = requests.get(article_url, headers=headers, proxies=proxy)
            soup = BeautifulSoup(r.text, 'html.parser')
            # css-exrw3m was the class NYT used for article body paragraphs at the time.
            for p in soup.find_all('p', {'class': 'css-exrw3m'}):
                analyze(p.text)


def start():
    get_nynews()
    write_file()


if __name__ == '__main__':
    start()
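
For the counting part, the standard library's collections.Counter can replace the hand-rolled dictionary logic. The snippet below is just a minimal sketch of that alternative, not part of the script I actually ran; it keeps the same two-letter word pattern and the same word.txt output format as the code above.

import re
from collections import Counter

word_counter = Counter()

def analyze(text):
    # Counter.update increments the count for every word in the iterable.
    word_counter.update(w.lower() for w in re.findall(r'[a-zA-Z]{2,}', text))

def write_file():
    total = sum(word_counter.values())
    with open('./word.txt', 'w') as f:
        # most_common() already returns (word, count) pairs sorted by count.
        for word, count in word_counter.most_common():
            f.write(word + '\t' + str(count) + '\n')
        f.write('all:' + str(total) + '\n')

most_common() takes the place of the explicit sorted(..., reverse=True) call, and the total can be recomputed from the counter instead of being tracked in a separate global.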

The cleaned-up word frequency table is here: https://shimo.im/sheets/6JgGcggcvCPGcvTR/poI4w/
