# prg#1
# ptt_movie: crawl the PTT "movie" board index page and collect article links.
import requests
from bs4 import BeautifulSoup

article_href = []
r = requests.get("https://www.ptt.cc/bbs/movie/index.html")  # board index URL to scrape
soup = BeautifulSoup(r.text, "html.parser")
results = soup.select("div.title")  # each article title lives in a div.title
print(results)

# Collect every article link on this page.
for item in results:
    a_item = item.select_one("a")
    # Bug fix: deleted articles have no <a> tag inside div.title, so
    # select_one() returns None — guard instead of crashing with
    # AttributeError (prg#3/#4 below already guard this way).
    if a_item:
        article_href.append(a_item.get("href"))
print(article_href)

# --------------------------------------------------------------
# prg#2
# ptt_movie #2: follow the paging buttons for n pages and print each URL.
import requests
from bs4 import BeautifulSoup

n = 10  # number of pages to walk
url = "https://www.ptt.cc/bbs/movie/index.html"
for page in range(1, n + 1):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    # 'div.btn-group > a' matches the action-bar buttons; index 3 is the
    # '‹ 上頁' (previous/older page) link, so this walks backwards
    # through the board, newest page first.
    btn = soup.select('div.btn-group > a')
    up_page_href = btn[3]['href']
    url = 'https://www.ptt.cc' + up_page_href
    print(url)

# --------------------------------------------------------------
# prg#3
# ptt_movie #3: full version — print title + link for the first n pages.
import requests
from bs4 import BeautifulSoup

url = "https://www.ptt.cc/bbs/movie/index.html"  # PTT board URL
n = 2  # number of pages to scrape


def get_all_href(url):
    """Print the title and full article URL for every post on one index page."""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.select("div.title")
    for item in results:
        a_item = item.select_one("a")
        title = item.text
        if a_item:  # deleted posts have no link — skip them
            print(title, 'https://www.ptt.cc' + a_item.get('href'))


# Scrape the first n pages.
for page in range(1, n + 1):
    # Bug fix: scrape the CURRENT page before following the paging link.
    # The original advanced `url` first and then scraped, so the newest
    # index page was silently skipped.
    get_all_href(url=url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    btn = soup.select('div.btn-group > a')
    up_page_href = btn[3]['href']  # '‹ 上頁' (older page) link
    url = 'https://www.ptt.cc' + up_page_href

# --------------------------------------------------------------
# prg#4
# ptt_movie #4: like prg#3, but follow each article link and print its
# metadata (author / board / title / timestamp) instead of just the URL.
import requests
from bs4 import BeautifulSoup

url = "https://www.ptt.cc/bbs/movie/index.html"  # PTT board URL
n = 2  # number of pages to scrape


def get_article_content(article_url):
    """Fetch one article page and print its meta fields (author, board, title, time)."""
    r = requests.get(article_url)
    soup = BeautifulSoup(r.text, "html.parser")
    # span.article-meta-value holds the header fields in document order:
    # [0] author, [1] board, [2] title, [3] timestamp.
    results = soup.select('span.article-meta-value')
    if results:
        print('作者:', results[0].text)
        print('看板:', results[1].text)
        print('標題:', results[2].text)
        print('時間:', results[3].text)
    print('-------------------------')


def get_all_href(url):
    """For every post on one index page, fetch and print the article's metadata."""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.select("div.title")
    for item in results:
        a_item = item.select_one("a")
        title = item.text
        if a_item:  # deleted posts have no link — skip them
            # Originally this printed the title + URL; now it fetches the
            # article body and prints its metadata instead.
            get_article_content(article_url='https://www.ptt.cc' + a_item.get('href'))
    print('----------------- Next Page --------------')


# Scrape the first n pages.
for page in range(1, n + 1):
    # Bug fix: scrape the CURRENT page before advancing to the older one
    # (the original skipped the newest index page — same fix as prg#3).
    get_all_href(url=url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    btn = soup.select('div.btn-group > a')
    up_page_href = btn[3]['href']  # '‹ 上頁' (older page) link
    url = 'https://www.ptt.cc' + up_page_href