You can handle exceptions in a Python multithreaded crawler with the following approaches:
- Use `try-except` statements: wrap the body of each crawler thread in a `try-except` statement to catch any exceptions it may raise. When an exception occurs, the program does not crash; instead, the code in the matching `except` block runs.
```python
import threading

import requests
from bs4 import BeautifulSoup

def crawl(url):
    try:
        # A timeout prevents a stalled request from hanging the thread forever
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # crawler logic
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except Exception as e:
        print(f"Other error: {e}")

# Create the threads
threads = []
for i in range(10):
    t = threading.Thread(target=crawl, args=("https://example.com",))
    t.start()
    threads.append(t)

# Wait for all threads to finish
for t in threads:
    t.join()
```
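As a complementary safety net to the per-thread `try-except` above, Python 3.8+ lets you install a process-wide handler for exceptions that escape a thread's target function entirely. The sketch below is a minimal illustration (the handler name `handle_thread_exception` is my own, not part of the example above):

```python
import threading

def handle_thread_exception(args):
    # args is a named tuple carrying exc_type, exc_value, exc_traceback,
    # and the Thread object in which the exception was raised
    print(f"Uncaught exception in {args.thread.name}: {args.exc_value}")

# threading.excepthook (Python 3.8+) runs for any exception that escapes
# a thread's target function, catching whatever a try-except missed
threading.excepthook = handle_thread_exception

# Demo: the lambda raises ZeroDivisionError, which reaches the hook
t = threading.Thread(target=lambda: 1 / 0)
t.start()
t.join()
```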
- Use the `logging` module: record exception details with the `logging` module instead of printing them to the console. This gives you finer control over log output and makes analysis and debugging easier.
```python
import logging
import threading

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def crawl(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # crawler logic
    except requests.exceptions.RequestException as e:
        logging.error(f"Request error: {e}")
    except Exception as e:
        logging.error(f"Other error: {e}")

# Create the threads
threads = []
for i in range(10):
    t = threading.Thread(target=crawl, args=("https://example.com",))
    t.start()
    threads.append(t)

# Wait for all threads to finish
for t in threads:
    t.join()
```
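One refinement worth knowing: when called inside an `except` block, `logging.exception` records the message at ERROR level and automatically appends the full traceback, which is usually more useful for debugging a crawler than the exception message alone. A minimal sketch (the `ValueError` here just simulates a failure):

```python
import logging

logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')

try:
    raise ValueError("simulated crawl failure")
except Exception:
    # logging.exception is equivalent to logging.error(..., exc_info=True):
    # the full traceback is appended to the log record
    logging.exception("Crawl failed")
```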
- Use `concurrent.futures.ThreadPoolExecutor`: this class provides higher-level thread-pool management and makes it easier to handle exceptions raised in worker threads.
```python
import concurrent.futures

import requests
from bs4 import BeautifulSoup

def crawl(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # crawler logic
        return soup  # return the parsed page so callers can tell success from failure
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None
    except Exception as e:
        print(f"Other error: {e}")
        return None

urls = ["https://example.com"] * 10
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(crawl, urls)
    for result in results:
        if result is not None:
            pass  # process the result
```
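The pool can also surface worker exceptions for you, so error handling does not have to live inside the worker at all. In this hedged sketch (the `fetch` helper is illustrative, not from the example above), `future.result()` re-raises whatever exception the worker raised, keeping all error handling in one place:

```python
import concurrent.futures

import requests

def fetch(url):
    # No try-except here: any exception propagates into the Future
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.text

urls = ["https://example.com"] * 10
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each Future back to its URL so failures can be reported per URL
    futures = {executor.submit(fetch, url): url for url in urls}
    for future in concurrent.futures.as_completed(futures):
        url = futures[future]
        try:
            text = future.result()  # re-raises any exception from the worker
        except Exception as e:
            print(f"{url} failed: {e}")
        else:
            print(f"{url} returned {len(text)} characters")
```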
These techniques help you handle exceptions in a Python multithreaded crawler and keep it running reliably.