Scraping a Twitter following list with Python

December 3, 2021

Problem description

I recently wanted to scrape the list of accounts a Twitter user is following. Inspecting the network traffic didn't make it clear how the data is fetched, and I don't have a developer account, so the only option left was to scrape it with selenium and a webdriver.

Solution

For some reason selenium could not log in to Twitter with cookies, so instead I collect the following list directly with JavaScript. This method is semi-automatic: you have to decide yourself when to stop.

Open the following list in the browser at https://twitter.com/xxxxxx/following, open the developer-tools console, and run the following commands to start auto-scrolling and capturing the markup.

// Collect serialized snapshots of the following timeline while auto-scrolling.
let alldata = [];
let count = 0
// Every 2 seconds: snapshot the "Timeline: Following" container, then scroll to the bottom.
// The aria-label below is the Chinese-locale UI text; adjust it to match your interface language.
const down = setInterval(function () {
    console.log("count", count)
    count += 1
    let t = new XMLSerializer().serializeToString(document.evaluate("//div[@aria-label='时间线:正在关注']", document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue)
    alldata.push(t)
    window.scrollTo(0, document.body.scrollHeight)
}, 2000)

Once the following list has scrolled to the last entry, run the following commands to stop the timer and export an HTML file.

// Stop the auto-scroll timer and download everything captured so far as one HTML file.
clearInterval(down)
let download = document.createElement('a');
download.download = +new Date() + ".html";
download.style.display = 'none';
let blob = new Blob([alldata.join("")]);
download.href = URL.createObjectURL(blob);
document.body.appendChild(download);
download.click();
document.body.removeChild(download);

Refresh the page and repeat this several times, put the exported HTML files into an empty folder, then run the Python script below to consolidate the following list. Set SAVE_PATH to the folder that contains the exported HTML files.

from bs4 import BeautifulSoup
from lxml import etree
import json
import os

SAVE_PATH = ""

ALREADY = []

DATA = []


def resolve_output_file(content):
    global ALREADY
    global DATA
    dom = etree.HTML(str(content))
    for item in dom.xpath('//div[@data-testid="cellInnerDiv"]'):
        user = {}
        d = etree.HTML(etree.tostring(
            item, pretty_print=False).decode("utf8"))
        try:

            ltrs = d.xpath('//div[@dir="ltr"]')
            user['name'] = BeautifulSoup(etree.tostring(
                ltrs[0], pretty_print=False).decode("utf8"), "lxml").text.replace("\n", "")
            user['uid'] = BeautifulSoup(etree.tostring(
                ltrs[2], pretty_print=False).decode("utf8"), "lxml").text.replace("\n", "")
            print(user['uid'], end=" -> ")
            if user['uid'] in ALREADY:
                print("SKIP")
                continue
            ALREADY.append(user['uid'])
            user['url'] = "https://twitter.com/%s" % user['uid'].replace(
                "@", "")
        except Exception as e:
            print("Get Uid And Name Fail %s" % str(e))
            continue

        try:
            imgs = d.xpath('//img[@draggable="true"]')
            user['avatar'] = BeautifulSoup(etree.tostring(
                imgs[0], pretty_print=False).decode("utf8"), "lxml").find("img").attrs['src']
        except Exception as e:
            print("Get Avatar Fail %s" % str(e))
            continue

        try:
            descs = d.xpath('//div[@dir="auto"]')
            user['desc'] = BeautifulSoup(etree.tostring(
                descs[-1], pretty_print=False).decode("utf8"), "lxml").text.replace(" ", "").replace("\n", "")

        except Exception as e:
            print("Get Description Fail %s" % str(e))

        DATA.append(user)
        print("OK")


for fn in os.listdir(SAVE_PATH):
    with open("%s/%s" % (SAVE_PATH, fn), "r", encoding="utf8") as f:
        resolve_output_file(f.read())

print(len(DATA))

with open("twitter_following.json", "w", encoding="utf8") as fd:
    fd.write(json.dumps(DATA))

The following list is saved to the twitter_following.json file.
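For reference, each entry in twitter_following.json carries the fields collected above; a single record looks roughly like this (the values here are made up for illustration):

[
    {
        "name": "Example User",
        "uid": "@example",
        "url": "https://twitter.com/example",
        "avatar": "https://pbs.twimg.com/profile_images/.../example.jpg",
        "desc": "Example bio text"
    }
]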

Solution (no longer works)

This approach opens the following page with selenium, simulates scrolling down, and parses the HTML with beautifulsoup4 and lxml. It is very slow, the results are inaccurate, and it cannot retrieve everyone being followed.

Get the cookies

Log in to Twitter to obtain cookies, then export them as JSON with the EditThisCookie extension and save them to cookie.json.
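The script below only reads the name and value of each cookie, so cookie.json just needs to be a JSON array of objects with those two fields; the cookie names and values here are placeholders:

[
    {"name": "auth_token", "value": "xxxxxxxxxxxxxxxx"},
    {"name": "ct0", "value": "xxxxxxxxxxxxxxxx"}
]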

Install the Python libraries and webdriver

pip install beautifulsoup4 lxml selenium

Edge is used here; download msedgedriver.exe and place it in a directory that is on PATH.

Microsoft Edge Driver - Microsoft Edge Developer

Python code

import json
import time
import lxml
import bs4
from selenium import webdriver


def get_user_data(h):
    users = []
    soup = bs4.BeautifulSoup(h, "lxml")
    for item in soup.find('div').contents:
        if isinstance(item, bs4.NavigableString):
            continue
        if isinstance(item, bs4.Tag):
            a = item.find("a")
            if a is None:
                continue
            url = "https://twitter.com" + a.attrs['href']
            dom = lxml.etree.HTML(str(item))
            uid = dom.xpath('//div[@dir="ltr"]')[0][0].text
            name = dom.xpath('//div[@dir="auto"]')[0][0][0].text
            desc = ""
            try:
                desc_select = dom.xpath('//div[@dir="auto"]')[3]
                desc = bs4.BeautifulSoup(lxml.etree.tostring(
                    desc_select).decode(), "lxml").text.replace("\n", "").replace(" ", "")
            except Exception as _:
                pass
            avatar = dom.xpath('//img[@draggable="true"]')[0].attrib['src']
            users.append({
                'url': url,
                'uid': uid,
                'name': name,
                'desc': desc,
                'avatar': avatar
            })

    return users


def get_cookies():
    with open("cookie.json") as f:
        return json.loads(f.read())


def get_following_list(user):
    alldatas = []
    options = webdriver.EdgeOptions()
    options.add_argument("--proxy-server=http://192.168.2.2:7890")  # local proxy used here; remove or change as needed
    driver = webdriver.Edge(options=options)

    driver.get("https://twitter.com/")

    for cookie in get_cookies():
        driver.add_cookie(
            {"name": cookie['name'], "value": cookie['value']})

    driver.get("https://twitter.com/%s/following" % user)

    time.sleep(5)

    last_element = None

    while True:
        # The aria-label is the Chinese-locale UI text ("Timeline: Following"); adjust it to your interface language.
        html = driver.find_element_by_xpath(
            "//div[@aria-label='时间线:正在关注']").get_attribute("innerHTML")
        datas = get_user_data(html)
        if len(datas) == 0:
            print("No data found")
            break
        last = datas[-1]['uid']

        if last == last_element:
            print("All entries loaded")
            break
        else:
            print("Fetching next batch")
            last_element = last
        new = 0
        for item in datas:
            if item not in alldatas:
                new += 1
                alldatas.append(item)
        print("获取到%s, 新数据%s, 总计%s" % (len(datas), new, len(alldatas)))
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(3)

    print("总计%s" % len(alldatas))
    driver.quit()

    return alldatas


data = get_following_list("Rob_Flaherty")  # the user id, as it appears in the profile URL

for user in data:
    print(user)
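If you also want to persist what this method collects, the same JSON dump used in the main solution can be appended (this is an addition, not part of the original script):

import json

with open("twitter_following.json", "w", encoding="utf8") as fd:
    fd.write(json.dumps(data))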

A better solution

I was careless and forgot to first search for an existing tool.

twintproject/twint: An advanced Twitter scraping & OSINT tool written in Python that doesn't use Twitter's API, allowing you to scrape a user's followers, following, Tweets and more while evading most API limitations. (github.com)
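As a rough sketch based on twint's README (the configuration fields are taken from the project documentation and may have changed), fetching a following list looks something like this:

import twint

# Account whose "following" list we want (the id from the profile URL)
c = twint.Config()
c.Username = "Rob_Flaherty"
# Write the result to a CSV file in addition to printing it
c.Store_csv = True
c.Output = "twitter_following.csv"

# Fetch the accounts this user is following
twint.run.Following(c)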
