# -*- coding: utf-8 -*-
from login.login import Login as Login
import requests
import http.cookiejar as cookielib
import configparser
from bs4 import BeautifulSoup
import sys
import redis
import json
import math
import pymysql
import traceback
import threading
import time
import random
# Load the crawler configuration once at import time. "config.ini" is a
# relative path, so it is resolved against the current working directory.
# NOTE(review): ConfigParser.read() skips files it cannot open instead of
# raising, so a missing config.ini only shows up later when a section is read.
cfg = configparser.ConfigParser()
cfg.read("config.ini")
class GetUser(threading.Thread):
# Class-level defaults. __init__ replaces session, config, redis_con, db,
# db_cursor, max_queue_len and sleep_time with per-instance values.
session = None
config = None
# Request headers passed as ``headers=`` to the session.get() calls of this class.
# This dict lives on the class, so every instance sees the same object.
# NOTE(review): the authorization value is hard-coded here — confirm it is not a private credential.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Host": "www.zhihu.com",
"Referer": "https://www.zhihu.com/",
"Origin": "https://www.zhihu.com/",
"Upgrade-Insecure-Requests": "1",
"Content-Type": "application/json, text/plain, */*",
"Pragma": "no-cache",
"Accept-Encoding": "gzip, deflate",
'Connection': 'close',
'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
retry = 0 # retry count
redis_con = ''
counter = 0 # count of users crawled
xsrf = ''
db = None
db_cursor = None
max_queue_len = 1000 # maximum length of the redis queue of users waiting to be crawled
# Alternative User-Agent strings.
# NOTE(review): not referenced in the visible part of the file — presumably rotated into requests elsewhere; confirm.
ua = (
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
)
# Default delay value; overridden from the [sys] config section in __init__.
# NOTE(review): unit is presumably seconds — confirm where it is consumed.
sleep_time = 1
def __init__(self, threadID=1, name=''):
    """Initialise the worker thread and the resources it crawls with.

    Creates the HTTP session (loading saved cookies from the ``cookie``
    file when possible), the redis client and the MySQL connection/cursor
    from the module-level ``cfg`` parser, then reads the ``[sys]`` tuning
    values ``max_queue_len`` and ``sleep_time``.

    Args:
        threadID: Identifier printed in the start-up messages and stored
            on the instance as ``self.threadID``.
        name: Value assigned to the thread's ``name``.

    Raises:
        SystemExit: If setting up the redis client or the MySQL connection
            raises (``sys.exit()`` is called after printing a hint).
        Exception: Errors from reading or converting the ``[sys]`` options
            are not caught here and propagate to the caller.
    """
    print("线程" + str(threadID) + "初始化")
    threading.Thread.__init__(self)
    self.threadID = threadID
    self.name = name
    print("线程" + str(threadID) + "初始化成功")
    self.threadLock = threading.Lock()

    # Shared configuration parsed at module import time.
    self.config = cfg

    # HTTP session backed by an on-disk LWP cookie jar.
    requests.adapters.DEFAULT_RETRIES = 5
    self.session = requests.Session()
    self.session.cookies = cookielib.LWPCookieJar(filename='cookie')
    self.session.keep_alive = False
    try:
        self.session.cookies.load(ignore_discard=True)
    except Exception:
        # Best effort: without a readable cookie file the session simply
        # starts with an empty jar. Catching Exception (not a bare except)
        # lets KeyboardInterrupt/SystemExit through.
        print('Cookie 未能加载')

    # Logging in through Login(self.session).do_login() is currently disabled.

    # Redis client used for the queue of users waiting to be crawled.
    try:
        redis_host = self.config.get("redis", "host")
        redis_port = self.config.get("redis", "port")
        self.redis_con = redis.Redis(host=redis_host, port=redis_port, db=0)
    except Exception as err:
        print("请安装redis或检查redis连接配置")
        print(err)  # surface the underlying cause instead of discarding it
        sys.exit()

    # MySQL connection and cursor.
    try:
        db_host = self.config.get("db", "host")
        db_port = int(self.config.get("db", "port"))
        db_user = self.config.get("db", "user")
        db_pass = self.config.get("db", "password")
        db_db = self.config.get("db", "db")
        db_charset = self.config.get("db", "charset")
        self.db = pymysql.connect(host=db_host, port=db_port, user=db_user,
                                  passwd=db_pass, db=db_db, charset=db_charset)
        self.db_cursor = self.db.cursor()
    except Exception as err:
        print("请检查数据库配置")
        print(err)  # surface the underlying cause instead of discarding it
        sys.exit()

    # Tuning values from the [sys] section.
    self.max_queue_len = int(self.config.get("sys", "max_queue_len"))
    self.sleep_time = float(self.config.get("sys", "sleep_time"))
# 获取首页html
def get_index_page(self):
    """Download the explore page and return its HTML text.

    The request goes through ``self.session`` with ``self.headers`` and a
    35 second timeout. ``self.save_cookie()`` is called afterwards whether
    or not the request succeeded.

    Returns:
        The response text, or ``None`` when the request raised. Despite the
        wording of the printed message, no retry is attempted here.
    """
    # NOTE(review): this is an IP-based address while the Host header in
    # self.headers names www.zhihu.com — confirm the address is intended.
    explore_url = 'http://222.178.203.72:19005/whst/63/=vvvzyghgtzbnl//explore'
    request_failed = False
    response = None
    try:
        response = self.session.get(explore_url, headers=self.headers, timeout=35)
    except Exception as exc:
        request_failed = True
        print("获取页面失败,正在重试......")
        print(exc)
        traceback.print_exc()
    finally:
        # Persist cookies on both the success and the failure path.
        self.save_cookie()
    return None if request_failed else response.text
# 获取首页上的用户列表,存入redis
def get_index_page_user(self):
    """Queue every author linked from the explore page for crawling.

    Fetches the page with ``get_index_page()``, finds all ``<a>`` tags with
    class ``author-link`` and passes the last path segment of each ``href``
    to ``add_wait_user()``.

    Anchors without a usable ``href`` (attribute missing, empty, or ending
    in ``/`` so that the last segment is empty) are reported and skipped
    instead of aborting the whole scan.

    Returns:
        None. Returns early without parsing when the page could not be
        fetched.
    """
    index_html = self.get_index_page()
    if not index_html:
        return
    soup = BeautifulSoup(index_html, "html.parser")
    for anchor in soup.find_all("a", class_="author-link"):
        href = anchor.get('href')
        # The user's url token is whatever follows the final '/' of the link.
        url_token = href.rsplit('/', 1)[-1] if href else ''
        if not url_token:
            print("获取首页author-link失败,跳过")
            continue
        self.add_wait_user(url_token)
# 获取粉丝页面,接口信息
def get_follower_page(self, name_url, offset=0, limit=20):
    """Request one page of a user's followers API and return the raw body.

    Args:
        name_url: The user's url token, inserted into the API path.
        offset: Value of the ``offset`` query parameter.
        limit: Value of the ``limit`` query parameter.

    Returns:
        The response text, or ``None`` when the request raised.
        ``self.save_cookie()`` is called in both cases.
    """
    # NOTE(review): IP-based address while the Host header in self.headers
    # names www.zhihu.com — confirm the address is intended.
    api_url = "".join([
        'http://222.178.203.72:19005/whst/63/=vvvzyghgtzbnl//api/v4/members/',
        str(name_url),
        '/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=',
        str(offset),
        '&limit=',
        str(limit),
    ])
    request_failed = False
    response = None
    try:
        response = self.session.get(api_url, headers=self.headers, timeout=35)
    except Exception as exc:
        request_failed = True
        print("失败name_url:" + str(name_url) + "获取页面失败,放弃该用户")
        print(exc)
        traceback.print_exc()
    finally:
        # Persist cookies on both the success and the failure path.
        self.save_cookie()
    return None if request_failed else response.text
# 分析粉丝接口获取用户的所有粉丝用户
# @param follower_page get_follower_page()中获取到的页面,这里获取用户hash_id请求粉丝接口获取粉丝信息
def get_all_follower(self, name_url):
    """Queue every follower of a user for crawling.

    Requests the followers API page by page (20 entries per page) through
    ``get_follower_page()`` and hands each entry's ``url_token`` to
    ``add_wait_user()``. The first response supplies the total follower
    count and is reused as page 0 rather than being requested a second
    time.

    Args:
        name_url: The url token of the user whose followers are listed.

    Returns:
        None. Gives up silently (after printing) when the first page cannot
        be fetched or parsed; a failure on a later page is printed and that
        page is skipped.
    """
    first_page = self.get_follower_page(name_url)
    if not first_page:
        return
    try:
        data = json.loads(first_page)
        follower_num = int(data['paging']['totals'])
        # Validate up front that the paging block carries 'is_end'; a
        # response without it is treated as unusable.
        bool(data['paging']['is_end'])
    except Exception as err:
        print(err)
        traceback.print_exc()
        print("获取关注者列表失败,放弃")
        return

    per_page = 20
    page_count = int(math.ceil(follower_num / per_page))
    for offset in range(0, page_count * per_page, per_page):
        try:
            if offset:
                # Page 0 is already in `data`; only later pages need a request.
                data = json.loads(self.get_follower_page(name_url, offset, per_page))
            is_end = bool(data['paging']['is_end'])
            for user in data['data']:
                self.add_wait_user(user['url_token'])  # stored in redis by add_wait_user
            if is_end:
                break
        except Exception as err:
            # Best effort: one bad page must not stop the remaining pages.
            print("获取关注者列表失败,继续循环")
            print(err)
            continue
# 获取正在关注api
def get_following_page(self, name_url, offset=0, limit=20):
user_page_url = 'http://222.178.203.72:19005/whst/63/=vvvzyghgtzbnl//api/v4/members/' + str(
name_url) + '/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=' + str(
offset) + '&limit=' + str(limit)
tr
博士僧小星
- 粉丝: 2442
- 资源: 5998
最新资源
- kde-l10n-Thai-4.10.5-2.el7.x64-86.rpm.tar.gz
- 三菱PLC FX3U画圆程序的详解与编写指南:零基础入门教程,三菱plc程序三菱FX3U画圆程序,只要弄明白这个程序,就可以非常了解整个项目的程序如何去编写,从哪里开始下手,程序流程清晰明了,每一句都
- kde-l10n-Turkish-4.10.5-2.el7.x64-86.rpm.tar.gz
- 电力电子仿真模型中的Buck与Boost变换器、单相与三相逆变技术及并网逆变、三相PWM整流器与晶闸管整流技术的研究,电力电子仿真模型 Buck变器 Boost变器 单相 三相逆变,并网逆变 三相PW
- kde-l10n-Ukrainian-4.10.5-2.el7.x64-86.rpm.tar.gz
- kde-l10n-Uyghur-4.10.5-2.el7.x64-86.rpm.tar.gz
- 三菱Fx3u MODBUS RTU从站程序详解:支持H03和H10功能码,定义读写区域与通信优化,含注释及字节交换功能,三菱Fx3u MODBUS RTU 从站程序 支持H03和H10功能码 下载
- kde-l10n-Vietnamese-4.10.5-2.el7.x64-86.rpm.tar.gz
- 海康相机图片采集与目标检测:基于YOLOv5与工业视觉的应用方案,yolov5 海康相机 工业视觉 海康相机采集的图片使用yolov5进行目标检测,yolov5推理使用c++封装dll,调用海康sdk
- kde-l10n-Walloon-4.10.5-2.el7.x64-86.rpm.tar.gz
- Java毕设项目:基于SpringBoot+mybatis+maven+mysql实现的员工绩效考核管理系统【含源码+数据库+任务书+答辩PPT+毕业论文】
- MAKINO系列机床操作与维修设定指南:PRO3操作、V55维护、报警表及作业规范手册,MAKINO 牧野 PRO3 维修设定操作 A55 PRO3操作说明书 日文.pdf A55卧加工作台旋转后加
- kde-print-manager-4.10.5-4.el7.x64-86.rpm.tar.gz
- 基于pga411的汽车级旋转变压器解码器:角度信号转绝对值,USB显示,简单调零与位置指示功能,pga411旋转变压器 旋变解码 旋变调零旋变信号解码器旋转变压器转成角度信号正余弦转成绝对值 使用汽
- kde-runtime-4.10.5-11.el7.x64-86.rpm.tar.gz
- Matlab Cplex下的储能电站服务冷热电多微网双层优化配置策略:BMILP问题与KKT等效模型探讨,Matlab Cplex代码:基于储能电站服务的冷热电多微网系统双层优化配置 参考电网技术的
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈