Crawl_heavens
目录
1. 电影爬取
- main.py
# -*- encoding:utf-8 -*-
import sys
from PyQt5.QtWidgets import QDialog, QLabel, QPushButton, QLineEdit, QListWidget, QGridLayout, QComboBox, QMessageBox, QApplication, QMenuBar, QAction, QMainWindow, QWidget, QVBoxLayout
from PyQt5.QtCore import pyqtSlot, QThread, QObject
from PyQt5.QtGui import QIcon, QPixmap, QImage
from movieSource.MovieHeaven import MovieHeaven
class ImageWindow(QMainWindow):
def __init__(self, resources, title):
super(ImageWindow, self).__init__()
self.setWindowTitle(title)
self.central_widget = QWidget()
self.setCentralWidget(self.central_widget)
layout = QVBoxLayout(self.central_widget)
image = QImage(resources)
pixmap = QPixmap(resources)
image_label = QLabel(self)
image_label.setPixmap(pixmap)
image_label.resize(pixmap.width(), pixmap.height())
layout.addWidget(image_label)
class LayoutDialog(QMainWindow):
__slots__ = ['word', 'movie_name_label', 'movie_name_line_edit', 'movie_source_label', 'movie_source_combobox',
'search_push_button', 'tip_label', 'search_content_label', 'search_content_text_list']
def __init__(self):
super().__init__()
self.left = 300
self.top = 300
self.width = 400
self.height = 450
self.work = WorkThread()
self.init_widgets().init_layout().init_event()
def init_widgets(self):
self.setWindowTitle(self.tr("Search Movies"))
self.setGeometry(self.left, self.top, self.width, self.height)
self.movie_name_label = QLabel(self.tr("电影名称:"))
self.movie_name_line_edit = QLineEdit()
self.movie_source_label = QLabel(self.tr("选择片源:"))
self.movie_source_combobox = QComboBox()
self.movie_source_combobox.addItem(self.tr('电影天堂'))
self.search_push_button = QPushButton(self.tr("查询"))
self.tip_label = QLabel(self.tr("未开始查询..."))
self.search_content_label = QLabel(self.tr("查询内容:"))
self.search_content_text_list = QListWidget()
self.menu_bar = self.menuBar()
return self
def init_layout(self):
top_layout = QGridLayout()
top_layout.addWidget(self.movie_name_label, 0, 0)
top_layout.addWidget(self.movie_name_line_edit, 0, 1)
top_layout.addWidget(self.movie_source_label, 0, 2)
top_layout.addWidget(self.movie_source_combobox, 0, 3)
top_layout.addWidget(self.search_push_button, 0, 4)
top_layout.addWidget(self.tip_label, 3, 1)
top_layout.addWidget(self.search_content_label, 3, 0)
top_layout.addWidget(self.search_content_text_list, 4, 0, 2, 5)
main_frame = QWidget()
self.setCentralWidget(main_frame)
main_frame.setLayout(top_layout)
self.reward_window = ImageWindow('resources/wechat_reward.jpg', '赞赏')
self.watch_window = ImageWindow('resources/watch_wechat.jpg', '关注')
return self
def init_event(self):
self.search_push_button.clicked.connect(self.search)
self.search_content_text_list.itemClicked.connect(self.copy_text)
reward_action = QAction('赞赏', self)
reward_action.setIcon(QIcon('resources/reward.png'),)
reward_action.triggered.connect(self.reward)
watch_action = QAction('关注', self)
watch_action.setIcon(QIcon('resources/watch.png'),)
watch_action.triggered.connect(self.watch_wechat)
reward_menu = self.menu_bar.addMenu('支持作者')
reward_menu.addAction(reward_action)
reward_menu.addAction(watch_action)
def reward(self):
self.reward_window.show()
def watch_wechat(self):
self.watch_window.show()
def search(self):
self.tip_label.setText(self.tr("正在查询请稍后..."))
movie_name = self.movie_name_line_edit.text()
if movie_name:
self.work.render(movie_name, self.movie_source_combobox,
self.tip_label, self.search_content_text_list)
else:
self.critical("请输入电影名称!")
def critical(self, message):
"""
when the movieName is None,
remind users
"""
QMessageBox.critical(self, self.tr("致命错误"),
self.tr(message))
def copy_text(self):
copied_text = self.search_content_text_list.currentItem().text()
QApplication.clipboard().clear()
QApplication.clipboard().setText(copied_text)
self.slot_information()
def slot_information(self):
QMessageBox.information(self, "Success!", self.tr("成功将内容复制到剪贴板上!"))
class WorkThread(QThread): # 爬虫这里用了线程,之前没有想到
def __init__(self):
QThread.__init__(self)
def render(self, movie_name, movie_source_combobox, tip_label, search_content_text_list):
self.movies_list = []
self.movie_source_combobox = movie_source_combobox
self.movie_name = movie_name
self.tip_label = tip_label
self.search_content_text_list = search_content_text_list
self.start()
def get_select_movie_source(self, movie_name):
"""
according to the value of the QComboBox,
generate the right class of movie search
"""
movies, url, params = None, None, {"typeid": "1"}
select_source = self.movie_source_combobox.currentText()
if select_source == self.tr('电影天堂'):
movies = MovieHeaven()
url = "http://s.dydytt.net/plus/s0.php"
params["keyword"] = movie_name.encode('gb2312')
return movies, url, params
def run(self):
search_movies, url, params = self.get_select_movie_source(
self.movie_name)
print(search_movies,url, params)
try:
self.movies_list = search_movies.get_display_content(url, params)
except Exception as e:
self.movies_list.append(self.tr("过于频繁的访问"))
finally:
self.search_content_text_list.clear()
self.search_content_text_list.addItems(self.movies_list)
self.tip_label.setText(self.tr("查询结束"))
app = QApplication(sys.argv)
dialog = LayoutDialog()
dialog.show()
app.exec_()
- 电影天堂爬取代码
# -*- encoding:utf-8 -*-
import requests
import re
import urllib
from movieSource.fake_user_agent import useragent_random
from multiprocessing.dummy import Pool as ThreadPool
import sys
class MovieHeaven:
__slots__ = ['__pool', '__all_page_details_url_list', '__search_url', '__search_domain', '__download_domain',
'__params']
def __init__(self, parent=None):
self.__pool = ThreadPool(8)
self.__all_page_details_url_list = []
self.__search_url = "http://s.dydytt.net/plus/s0.php"
self.__search_domain = 'http://s.ygdy8.com'
self.__download_domain = 'http://www.ygdy8.com'
self.__params = {"typeid": "1",
"keyword": "leetao"}
def __get_headers(self):
return {"User-Agent": useragent_random()}
def __search_movie_results(self, url=None, params=None):
if url is None:
url = self.__search_url
temp_results = requests.get(
url, params=params, headers=self.__get_headers())
temp_results.encoding = 'gb2312'
return temp_results.text
def __get_movies_detail_page(self, searchResults):
"""
get the detailPage's url of movies by using regx
"""
pattern = re.compile(
r"<td\s+width='\d+%'><b><a\s+href='(.*\.html)'\s*>")
all_detai_pages = pattern.findall(searchResults)
return all_detai_pages
def __get_page_number_total(self, searchResults):
"""
get the total number of pages
"""
page_num_total_pattern = re.compile(
r"<td\s+width='30'><a\s+href='.+PageNo=(\d+)'\s*>")
page_num_total = page_num_total_pattern.findall(searchResults)
if len(page_num_total) == 0:
return -1
else:
return int(page_num_total[0])
def __next_page_detail(self, search_results):
"""
get the next page'url which lacks the pagenumber
"""
next_page_pattern = re.compile(
r"<td\s+width='30'><a href='(.*PageNo=)\d+'>")
next_page_url = next_page_pattern.findall(search_results)
return str(next_page_url[0])
def __get_search_content_by_url(self, next_page_url, page_num_total):
"""
get remain pages's url
"""
for page_no in range(2, page_num_total + 1):
if page_no is not None:
url = self.__search_domain + next_page_url + str(page_no)
res = self.__search_movie_results(url)
return self.__get_movies_detail_page(res)
def __get_movie_contents_url(self, url, params=None):
"""
get the first page of searching results
and get the remain pages's results
"""
first_page_results = self.__search_movie_results(url, params)
first_page_resultsList = self.__get_movies_detail_page(
first_page_results)
# get the remain pages's results
total_page_num = self.__get_page_number_total(first_page_results)
if total_page_num > 0:
next_page_url = self.__next_page_detail(first_page_results)
remain_page_results_list = self.__get_search_content_by_url(
next_page_url, total_page_num)
self.__all_page_details_url_list.extend(remain_page_results_list)
self.__all_page_details_url_list.extend(first_page_resultsList)
return self.__all_page_details_url_list
def __get_movie_down_url(self, down_page_url_list):
results_list = []
down_page_content_url_list = [
(self.__download_domain + url) for url in down_page_url_list]
for result_url_list in self.__pool.map(self.__get_down_page_content_url, self.__pool.map(self.__search_movie_results, down_page_content_url_list)):
if len(result_url_list) > 0:
results_list += result_url_list
self.__pool.close()
self.__pool.join()
return results_list
def __get_down_page_content_url(self, down_page_content):
download_url_list = []
ftp_down_pattern = re.compile(r'<td.+><a\s+href="(.+)"\s*>')
ftp_url_list = ftp_down_pattern.findall(down_page_content)
if len(ftp_url_list) > 0:
download_url_list.append(ftp_url_list[0])
magnet_down_pattern = re.compile(
r'<a\s+href="(magnet:\?xt=.+)"><strong>')
magnet_url_list = magnet_down_pattern.findall(down_page_content)
if len(magnet_url_list) > 0:
download_url_list.append(magnet_url_list[0].replace("amp;", ""))
return download_url_list
def get_display_content(self, url, params=None):
url_list = self.__get_movie_contents_url(url, params)
if len(url_list) == 0:
return ['Not Found']
else:
all_download_url_list = self.__get_movie_down_url(url_list)
movie_list = [
url for url in all_download_url_list if url is not None and url[-3:] not in ['zip', 'rar', 'exe']]
return movie_list
- face_agent.py
import random
FAKE_USER_AGENT = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/53"
]
def useragent_random():
return random.choice(FAKE_USER_AGENT)