# prg#5  ptt_movie #5 — PTT movie-board crawler
# Walks the newest n index pages, printing each article's meta fields
# (author / board / title / date) and its push (comment) messages.
import requests
from bs4 import BeautifulSoup

url = "https://www.ptt.cc/bbs/movie/index.html"  # PTT movie board index page
n = 2  # number of index pages to crawl


def get_article_content(article_url):
    """Fetch one article page and print its meta fields.

    Prints author / board / title / date followed by a separator line.
    Articles whose meta block is missing or incomplete (e.g. deleted
    posts) are skipped silently instead of raising IndexError.
    """
    r = requests.get(article_url)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.select('span.article-meta-value')
    # Guard on the full count: some pages carry fewer than 4 meta spans,
    # and indexing results[3] on them would raise IndexError.
    if len(results) >= 4:
        print('作者:', results[0].text)
        print('看板:', results[1].text)
        print('標題:', results[2].text)
        print('時間:', results[3].text)
        print('-------------------------')


def get_article_content_inside(article_url):
    """Fetch one article page and print every push (comment) message."""
    r = requests.get(article_url)
    soup = BeautifulSoup(r.text, "html.parser")
    articles = soup.find_all('div', 'push')
    for article in articles:
        content = article.find('span', 'f3 push-content')
        # Deleted or malformed pushes may lack the content span; skip
        # them instead of crashing on None.getText().
        if content is None:
            continue
        # Strip the leading colon and surrounding whitespace.
        messages = content.getText().replace(':', '').strip()
        print('messages = ', messages)
    print('-------------------------')


# Scrape every article title on one index page.
def get_all_href(url):
    """Fetch an index page and print the content of every listed article.

    For each title that still has a live link (deleted articles have no
    <a> tag), delegates to get_article_content / get_article_content_inside.
    NOTE: each article URL is fetched twice, once per helper.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.select("div.title")
    for item in results:
        a_item = item.select_one("a")
        if a_item:
            article_url = 'https://www.ptt.cc' + a_item.get('href')
            get_article_content(article_url=article_url)
            get_article_content_inside(article_url=article_url)
    print('----------------- Next Page --------------')


# Crawl the newest n index pages by following the previous-page link.
for page in range(1, n + 1):
    # Scrape the CURRENT page before following the link; the original
    # rebound `url` first, which skipped page 1 entirely and crawled
    # pages 2..n+1 instead of 1..n.
    get_all_href(url=url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    btn = soup.select('div.btn-group > a')
    # btn[3] is assumed to be the「上頁」(previous/older page) button —
    # verify against the board's current markup if PTT changes layout.
    up_page_href = btn[3]['href']
    url = 'https://www.ptt.cc' + up_page_href