维护一组浏览器,实现每分钟1000次查询。DriverPool使用变化版的、只初始化一次的单例模式,并维护每个浏览器当前是否正在被使用的状态。
不需要等待请求来了,临时开浏览器,开一个浏览器会耽误6秒钟。
可以在程序启动后,随便使用命令杀死selenium,不怕被别人杀死,不需要重启程序就能保证长久正常运行。
主要使用了 mixin继承、变化版单例模式、鸭子类型、桥接模式、上下文管理器,引入了资源池的概念,自动选择一个当前未被使用的浏览器。
使用池固定了浏览器的最大数量,避免直接开孤立的selenium driver:否则当并发大的时候,代码会突然启动几百上千个浏览器,导致系统性能突然衰竭。
# coding=utf8
"""Browser resource pool.

Browsers are started once, up front, and reused, so a request never has to
wait for a new browser to start (roughly six seconds each).  The pool is
meant to survive its browsers being killed (manually or by the OOM killer)
after start-up: using a dead browser raises ``URLError`` inside
``DriverContext``, which triggers a rebuild of the whole pool.
"""
import time
import os
from pathlib import Path
from threading import Lock
from urllib.error import URLError

from selenium.webdriver import DesiredCapabilities
from selenium.common.exceptions import WebDriverException
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

from app.utils_ydf import LoggerMixin, BoundedThreadPoolExecutor, decorators, LogManager

# Mobile user agent applied by both browser builders.
_MOBILE_USER_AGENT = (
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36')


class NoAvailableDriverError(Exception):
    """Raised when every browser in the pool is currently in use."""


class DriverItem:
    """A pooled browser together with its bookkeeping state."""

    def __init__(self, driver, ):
        self.driver = driver
        self.create_time = time.time()  # used to recycle browsers older than one hour
        self.is_using = False
        self.last_use_time = time.time()

    def __str__(self):
        # noinspection PyRedundantParentheses
        return (f"{time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(self.create_time))} {self.is_using} {time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(self.last_use_time))} {self.driver}")


class PhantomjsItemBuilder(LoggerMixin):
    """Builds ``DriverItem`` objects backed by a PhantomJS browser."""

    # noinspection PyBroadException
    def create_a_driver_item(self):
        """Start one PhantomJS browser and wrap it in a ``DriverItem``.

        On POSIX the executable is first looked up through the environment
        (PATH); if that fails, ``/usr/local/bin/phantomjs`` is tried.

        :return: a new, unused ``DriverItem``
        :raises Exception: whatever the last launch attempt raised, when the
            browser could not be started at all
        """
        t0 = time.time()
        capabilities = DesiredCapabilities.PHANTOMJS.copy()
        capabilities['platform'] = "WINDOWS"
        capabilities['version'] = "10"
        capabilities['phantomjs.page.settings.loadImages'] = False
        capabilities['phantomjs.page.settings.userAgent'] = _MOBILE_USER_AGENT
        service_args = ['--load-images=no', '--disk-cache=yes', '--ignore-ssl-errors=true']
        self.logger_with_file.info('创建一个driver中。。。。。。')
        if os.name == 'posix':
            try:
                driver = webdriver.PhantomJS(desired_capabilities=capabilities, service_args=service_args)
            except Exception as e:
                self.logger.exception(f'从环境变量获取driver路径失败,改为从/usr/local/bin文件夹获取 {e}')
                try:
                    driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs',
                                                 desired_capabilities=capabilities,
                                                 service_args=service_args)
                except Exception as e2:
                    self.logger.exception(f'从/usr/local/bin/phantomjs启动失败 {e2}')
                    # Propagate the real launch failure instead of falling
                    # through with driver=None and hitting an AttributeError.
                    raise
        else:
            driver = webdriver.PhantomJS(desired_capabilities=capabilities, service_args=service_args)
        driver.set_window_size(390, 713)
        driver.set_page_load_timeout(10)
        self.logger.info(f'创建一个浏览器耗时{time.time() - t0}')
        return DriverItem(driver)


class ChromeItemBuilder(LoggerMixin):
    """Builds ``DriverItem`` objects backed by a headless Chrome browser."""

    # Chrome location used by the original Windows deployment; only applied
    # when the file actually exists on this machine.
    _WINDOWS_CHROME_BINARY = r'C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe'

    def create_a_driver_item(self):
        """Start one headless Chrome browser and wrap it in a ``DriverItem``.

        :return: a new, unused ``DriverItem``
        """
        self.logger.info('创建一个driver中。。。。。。')
        t0 = time.time()
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-images')
        if Path(self._WINDOWS_CHROME_BINARY).exists():
            chrome_options.binary_location = self._WINDOWS_CHROME_BINARY
        prefs = {
            'profile.default_content_setting_values': {
                'images': 2,  # do not load images
                'javascript': 1,  # 2 would disable JavaScript
            }
        }
        chrome_options.add_experimental_option("prefs", prefs)
        # Per the original author, this is the switch that actually disables
        # images; the two settings above had no effect.
        chrome_options.add_argument('blink-settings=imagesEnabled=false')
        # The user agent was previously placed in the content-setting prefs
        # (with a stray trailing quote); pass it as a command-line switch.
        chrome_options.add_argument(f'--user-agent={_MOBILE_USER_AGENT}')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.set_window_size(390, 713)
        driver.set_page_load_timeout(100)
        driver.implicitly_wait(100)
        self.logger.info(f'创建一个浏览器耗时{time.time() - t0}')
        return DriverItem(driver)


class DriverPool(LoggerMixin):
    """Singleton pool of pre-started browsers.

    The first ``DriverPool(...)`` call creates the instance and runs
    ``__custom_init__`` with its arguments; every later call returns that same
    instance and ignores its arguments.
    """
    lock = Lock()

    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_instance'):
            self = super().__new__(cls, )
            cls._instance = self
            # Initialise here rather than in __init__, which Python would
            # re-run on every DriverPool() call.
            self.__custom_init__(*args, **kwargs)
        return cls._instance

    def __custom_init__(self, driver_item_num=10, driver_name=1):
        """One-time initialisation of the singleton.

        :param driver_item_num: number of browsers to keep in the pool
        :param driver_name: browser kind; 1 selects PhantomJS, any other value
            selects Chrome
        """
        self.driver_item_list = list()
        self._driver_item_num = driver_item_num
        self.driver_item_builder = PhantomjsItemBuilder() if driver_name == 1 else ChromeItemBuilder()
        self.logger_with_file.info(f'准备初始化{driver_item_num}个浏览器')
        self._has_init_all_driver_item = False
        self._init_time = 0
        self._init_all_driver_item()

    def _init_all_driver_item(self):
        """(Re)build the whole pool, at most once every 60 seconds.

        Kills leftover phantomjs processes, empties the pool and starts
        ``driver_item_num`` browsers concurrently.
        """
        # Check and stamp the throttle atomically so that several threads
        # failing at the same moment trigger a single rebuild.
        with self.lock:
            if time.time() - self._init_time <= 60:
                return
            self._init_time = time.time()

        self.logger.warning('杀死残留的phantomjs进程')
        # Killing phantomjs processes started elsewhere is acceptable: a
        # killed browser is recreated when it is used through DriverContext.
        if os.name == 'posix':
            os.system('ps -aux|grep phantomjs|grep -v grep|cut -c 9-15|xargs kill -9')
        else:
            os.system('taskkill /F /im phantomjs.exe')
        t0 = time.time()
        self.driver_item_list.clear()  # the old items must be dropped

        def _create_one():
            # noinspection PyBroadException
            try:
                self.driver_item_list.append(self.driver_item_builder.create_a_driver_item())
            except Exception as e:
                # Without this the failure would vanish inside the worker thread.
                self.logger.exception(f'初始化浏览器失败 {e}')

        # Per the original author, creating the browsers in parallel is much
        # faster than creating them one after another.
        thread_pool = BoundedThreadPoolExecutor(self._driver_item_num)
        for _ in range(self._driver_item_num):
            thread_pool.submit(_create_one)
        thread_pool.shutdown()
        self._has_init_all_driver_item = True
        self.logger.info(f'所有浏览器初始化创建成功,耗时 {time.time() - t0}秒 {len(self.driver_item_list)} {self.driver_item_list}')

    def borrow_a_driver_item(self):
        """Mark the first idle browser as in use and return it.

        An idle browser older than one hour is closed and replaced first, to
        limit phantomjs memory growth.

        :return: the borrowed ``DriverItem``
        :raises NoAvailableDriverError: if every browser is currently in use
        """
        with self.lock:
            current_using_number = sum(1 for item in self.driver_item_list if item.is_using)
            current_not_using_number = len(self.driver_item_list) - current_using_number
            self.logger.debug(f'当前正在使用的浏览器数量是{current_using_number},闲置的浏览器数量是{current_not_using_number}')
            for index, driver_item in enumerate(self.driver_item_list):
                if driver_item.is_using:
                    continue
                if time.time() - driver_item.create_time > 3600:
                    self.logger.debug('防止phantomjs内存泄漏,关闭并重新创建一个浏览器')
                    # noinspection PyBroadException
                    try:
                        driver_item.driver.quit()
                    except Exception as e:
                        # The browser may already be dead; still refill the slot.
                        self.logger.warning(f'关闭旧浏览器失败,继续重新创建 {e}')
                    driver_item = self.driver_item_builder.create_a_driver_item()
                    # Replace in place so a failure above can never shrink the pool.
                    self.driver_item_list[index] = driver_item
                driver_item.is_using = True
                return driver_item
            raise NoAvailableDriverError('当前没有可用的浏览器。。。。。。。。。。。。')

    @staticmethod
    def give_back_a_driver_item(driver_item: DriverItem):
        """Mark ``driver_item`` as idle again and record when it was last used."""
        driver_item.is_using = False
        driver_item.last_use_time = time.time()


class DriverContext:
    """Context manager that borrows a browser from the pool and returns it.

    ``with DriverContext() as driver:`` yields the selenium driver itself.
    Exceptions raised inside the ``with`` body are never suppressed.
    """

    def __init__(self):
        self.driver_pool = DriverPool()
        self.driver_item = None
        self.start_using_time = time.time()

    def __enter__(self):
        self.driver_item = self.driver_pool.borrow_a_driver_item()
        # Measure occupancy from the moment the browser is actually borrowed.
        self.start_using_time = time.time()
        self.driver_pool.logger_with_file.debug(f'当前使用的浏览器是 {self.driver_item}')
        return self.driver_item.driver

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.driver_pool.logger.info(f'此浏览器 {self.driver_item} 占用时间为 {time.time() - self.start_using_time}秒')
        self.driver_pool.give_back_a_driver_item(self.driver_item)
        if exc_type == URLError:
            # Using a phantomjs that was killed manually or by the OOM killer
            # raises URLError; rebuild the pool.
            self.driver_pool._init_all_driver_item()
        if exc_type and issubclass(exc_type, WebDriverException):
            self.driver_pool.logger.error(f'selenium发生错误 ,错误类型--> {exc_type} 错误原因--> {exc_val}')
        # Returns None on purpose, so the exception propagates to the caller.


if __name__ == '__main__':
    logger = LogManager('driver_pool_test').get_logger_and_add_handlers()
    DriverPool(50)
    Path('/picture').mkdir(exist_ok=True)


    @decorators.tomorrow_threads(40)
    def f():
        # Browsers must be used through the with-statement; otherwise the
        # caller has to maintain the in-use state by hand.
        with DriverContext() as driver:
            logger.debug(f'使用的浏览器是--> {driver}')
            driver.get('http://m.elong.com/ihotel/283904/?inDate=2018-12-12&outDate=2018-12-13&roomPerson=1|2')
            driver.save_screenshot(f'/picture/{time.time()}.png')
            WebDriverWait(driver, 10, 0.2).until(
                lambda driverx: driverx.find_element_by_css_selector('#detail-mapping-box > li:nth-child(1) > div.prodjh_list_box.clearfix > div.detail-mrooom-mapping-product > div.dprodtname'))
            logger.info(f'页面内容长度是: {len(driver.page_source)}')
            driver.save_screenshot(f'/picture/{time.time()}.png')


    for _ in range(50000):
        time.sleep(0.1)
        f()
使用如图,由于不需要对每次请求都频繁创建和摧毁浏览器,所以打开网页速度很快。