#####################################################
# Jieba三種斷詞法

# Jieba1
#encoding=utf-8
import jieba

# Walk through jieba's segmentation modes on a short sample sentence.
sample = "我來到台中教育大學"

# Full mode: cut_all=True.
full_tokens = jieba.cut(sample, cut_all=True)
print("Full Mode:", "/ ".join(full_tokens))

# Accurate mode, requested explicitly with cut_all=False.
accurate_tokens = jieba.cut(sample, cut_all=False)
print("Default Mode:", "/ ".join(accurate_tokens))

# cut_all omitted: accurate mode is the default.
print(", ".join(jieba.cut(sample)))

# Search-engine mode.
print(", ".join(jieba.cut_for_search("志明碩士畢業於台中教育大學，後在日本東京大學深造")))


sentence = "我來到台中教育大學就讀碩士"

# The same three modes again, printed as a label followed by '|'-joined tokens.
# sep="" keeps the label and the tokens adjacent, exactly like string concatenation.
print("預設模式:", "|".join(jieba.cut(sentence, cut_all=False)), sep="")
print("全文模式:", "|".join(jieba.cut(sentence, cut_all=True)), sep="")
print("搜尋引擎:", "|".join(jieba.cut_for_search(sentence)), sep="")

#-------------------------------------------------
Output:
預設模式:我來|到|台|中|教育|大學
全文模式:我|來|到|台中|教育|大|學
搜尋引擎:我來|到|台|中|教育|大學

#####################################################

# Jieba2  設定繁體中文詞庫
#encoding=utf-8
import jieba

# Point jieba at the Traditional Chinese dictionary file.
jieba.set_dictionary('dictionary/dict.txt.big.txt')

sentence = "我來到台中教育大學就讀碩士"

# Each entry pairs an output label with the segmenter to run for it:
# default (accurate) mode, full mode, and search-engine mode, in that order.
modes = (
    ("預設模式:", lambda s: jieba.cut(s, cut_all=False)),
    ("全文模式:", lambda s: jieba.cut(s, cut_all=True)),
    ("搜尋引擎:", jieba.cut_for_search),
)

for label, segment in modes:
    print(label + '|'.join(segment(sentence)))

#-------------------------------------------------
Output:
預設模式:我|來到|台|中|教育|大學
全文模式:我|來到|台中|教育|大學
搜尋引擎:我|來到|台|中|教育|大學
#-------------------------------------------------
修改後
Output:
預設模式:我|來到|台中教育大學
全文模式:我|來到|台中|台中教育大學|教育|大學
搜尋引擎:我|來到|台中|教育|大學|台中教育大學

#####################################################

# Jieba3 增加自定義的詞
#encoding=utf-8
import jieba

# Point jieba at the Traditional Chinese dictionary file.
jieba.set_dictionary('dictionary/dict.txt.big.txt')

sentence = "吳智鴻來到台中教育大學數位系就讀碩士"

# Un-comment a line below to add or remove a custom word before segmenting.
#jieba.add_word('數位系')
#jieba.add_word('凱特琳')
#jieba.del_word('自定义词')

# Default (accurate) mode.
default_joined = '|'.join(jieba.cut(sentence, cut_all=False))
print(f"預設模式:{default_joined}")

# Full mode.
full_joined = '|'.join(jieba.cut(sentence, cut_all=True))
print(f"全文模式:{full_joined}")

# Search-engine mode.
search_joined = '|'.join(jieba.cut_for_search(sentence))
print(f"搜尋引擎:{search_joined}")

#-------------------------------------------------
Output:
預設模式:吳智鴻|來到|台中教育大學|數位|系|就讀|碩士
全文模式:吳|智|鴻|來到|台中|台中教育大學|教育|大學|數位|系|就讀|碩士
搜尋引擎:吳智鴻|來到|台中|教育|大學|台中教育大學|數位|系|就讀|碩士
#-------------------------------------------------
修改後（取消註解 jieba.add_word('數位系') 之後）
Output:
預設模式:吳智鴻|來到|台中教育大學|數位系|就讀|碩士
全文模式:吳|智|鴻|來到|台中|台中教育大學|教育|大學|數位|數位系|就讀|碩士
搜尋引擎:吳智鴻|來到|台中|教育|大學|台中教育大學|數位|數位系|就讀|碩士


#####################################################

# Jieba4 載入自定義詞典
#encoding=utf-8
import jieba

# Point jieba at the Traditional Chinese dictionary file.
jieba.set_dictionary('dictionary/dict.txt.big.txt')

# Load user-defined dictionary entries (custom words, not stop words).
jieba.load_userdict('dictionary/user_dict.txt')


sentence = "吳智鴻，來到國立臺中教育大學數位系就讀碩士。"

# Register one extra custom word at runtime; the others are optional examples.
jieba.add_word('數位系')
#jieba.add_word('凱特琳')
#jieba.del_word('自定义词')

# (label, segmenter, keyword options) for default, full and search-engine mode.
for label, segment, options in (
    ("預設模式:", jieba.cut, {"cut_all": False}),
    ("全文模式:", jieba.cut, {"cut_all": True}),
    ("搜尋引擎:", jieba.cut_for_search, {}),
):
    print(label + '|'.join(segment(sentence, **options)))
#-------------------------------------------------
Output:
預設模式:吳智鴻|，|來到|國立臺中教育大學|數位系|就讀|碩士|。
全文模式:吳智鴻|||來到|國立|國立臺中教育大學|臺中|教育|大學|數位|數位系|就讀|碩士||
搜尋引擎:吳智鴻|，|來到|國立|臺中|教育|大學|國立臺中教育大學|數位|數位系|就讀|碩士|。
#####################################################

# Jieba5 增加停用字
#encoding=utf-8
import jieba

# Point jieba at the Traditional Chinese dictionary file.
jieba.set_dictionary('dictionary/dict.txt.big.txt')

# Load user-defined dictionary entries (custom words).
jieba.load_userdict('dictionary/user_dict.txt')

# Read the stop-word file, one entry per line, and echo the list.
with open('dictionary/stopword.txt', 'r', encoding='utf-8-sig') as stop_file:
    stops = stop_file.read().split('\n')
print("停用詞:" + '|'.join(stops))

sentence = "吳智鴻，來到國立臺中教育大學數位系就讀碩士。"

#jieba.add_word('數位系')
#jieba.add_word('凱特琳')
#jieba.del_word('自定义词')

# Default (accurate) mode, keeping only the tokens that are not stop words.
final_words = [
    word
    for word in jieba.cut(sentence, cut_all=False)
    if word not in stops
]
print("去除停用:" + '|'.join(final_words))

# The unfiltered result of each mode, for comparison with the line above.
print("預設模式:", '|'.join(jieba.cut(sentence, cut_all=False)), sep="")
print("全文模式:", '|'.join(jieba.cut(sentence, cut_all=True)), sep="")
print("搜尋引擎:", '|'.join(jieba.cut_for_search(sentence)), sep="")
#-------------------------------------------------
Output:
停用詞:"|;|,|，|。|來到
去除停用:吳智鴻|國立臺中教育大學|數位系|就讀|碩士
預設模式:吳智鴻|，|來到|國立臺中教育大學|數位系|就讀|碩士|。
全文模式:吳智鴻|||來到|國立|國立臺中教育大學|臺中|教育|大學|數位|數位系|就讀|碩士||
搜尋引擎:吳智鴻|，|來到|國立|臺中|教育|大學|國立臺中教育大學|數位|數位系|就讀|碩士|。

#####################################################
# WordCloud1  文字雲

from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
import numpy as np
from collections import Counter

# Read the source text; the context manager closes the file as soon as it is read.
with open('news1.txt', "r", encoding="utf-8") as src:
    text = src.read()

jieba.set_dictionary('dictionary/dict.txt.big.txt')

# Stop words, one per line. They are kept in a set because every token of the
# text is tested against them.
# Alternative stop-word list: 'dictionary/stopWord_cloudmod.txt'
with open('dictionary/stopWord_cloud.txt', 'r', encoding='utf-8-sig') as f:
    stops = set(f.read().split('\n'))

# Segment in accurate mode and keep the tokens that are not stop words.
terms = [t for t in jieba.cut(text, cut_all=False) if t not in stops]
diction = Counter(terms)
# Un-comment to inspect the token frequencies.
#print(diction)

# Font used to render the words. MSJH.ttf is Microsoft JhengHei;
# use "C:\\Windows\\Fonts\\simsun.ttc" instead for SimSun.
font = "C:\\Windows\\Fonts\\MSJH.ttf"

# The image supplies the shape of the cloud.
mask = np.array(Image.open("heart.png"))
# White background instead of the default black.
wordcloud = WordCloud(background_color="white", mask=mask, font_path=font)
wordcloud.generate_from_frequencies(diction)

# Show the picture.
plt.figure(figsize=(6, 6))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Save the picture to disk.
wordcloud.to_file("news_Wordcloud.png")

#####################################################



# NewsCloud3   文字雲結合google search

import requests
from bs4 import BeautifulSoup

# Google search endpoint and query parameters.
google_url = 'https://www.google.com.tw/search'
my_params = {'q': '寒流'}

# Download the search-result page.
r = requests.get(google_url, params=my_params)

# Bind `items` up front: code further down the file iterates over it, and it
# would otherwise be undefined (NameError) whenever the request fails.
items = []

if r.status_code == requests.codes.ok:
    # Parse the HTML source.
    soup = BeautifulSoup(r.text, 'html.parser')

    # Un-comment to inspect the HTML source.
    #print(soup.prettify())

    # Pick the result links with a CSS selector.
    # NOTE(review): the 'kCrYT' class name comes from Google's markup and may
    # change without notice - confirm it still matches.
    items = soup.select('div.kCrYT > a[href^="/url"]')

    for item in items:
        # Title
        print("標題：" + item.text)
        # URL (the selector only matches anchors that carry an href)
        print("網址：" + item.get('href'))
else:
    # Report the failure instead of continuing silently.
    print("Google 搜尋失敗，HTTP 狀態碼：", r.status_code)
#-----------------------------------------------

from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
import numpy as np
from collections import Counter

# Text source: either a local file (disabled) or the Google result titles.
#text = open('news1.txt', "r",encoding="utf-8").read()

# Concatenate the titles of the Google search results held in `items`.
text = ''.join(i.text for i in items)
print( text)

jieba.set_dictionary('dictionary/dict.txt.big.txt')

# Stop words, one per line. They are kept in a set because every token of the
# text is tested against them.
# Alternative stop-word list: 'dictionary/stopWord_cloudmod.txt'
with open('dictionary/stopWord_cloud.txt', 'r', encoding='utf-8-sig') as f:
    stops = set(f.read().split('\n'))

# Segment in accurate mode and keep the tokens that are not stop words.
terms = [t for t in jieba.cut(text, cut_all=False) if t not in stops]
diction = Counter(terms)
# Un-comment to inspect the token frequencies.
#print(diction)

# Font used to render the words. MSJH.ttf is Microsoft JhengHei;
# use "C:\\Windows\\Fonts\\simsun.ttc" instead for SimSun.
font = "C:\\Windows\\Fonts\\MSJH.ttf"

# The image supplies the shape of the cloud.
mask = np.array(Image.open("heart.png"))

if diction:
    # White background instead of the default black.
    wordcloud = WordCloud(background_color="white", mask=mask, font_path=font)
    wordcloud.generate_from_frequencies(diction)

    # Show the picture.
    plt.figure(figsize=(6, 6))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

    # Save the picture to disk.
    wordcloud.to_file("news_Wordcloud.png")
else:
    # generate_from_frequencies raises ValueError on an empty table, which is
    # what happens when the search returned no usable titles.
    print("沒有可用的字詞，無法產生文字雲")
#######################################################################






# summary1.py   (讀取文字檔並產生文章自動摘要)
#
# 需要檔案 AutoSummary.py, dictionary/stopWord_summar.txt,   issue1.txt
#           
import AutoSummary as ausu

content = 'issue1.txt'
# Read the original article. 'utf-8-sig' drops a leading byte-order mark if
# the file has one (plain 'utf8' would keep it as a stray U+FEFF character at
# the start of the text) and reads BOM-less files unchanged.
with open(content, 'r', encoding='utf-8-sig') as f:
    text = f.read()

# Stop-word list: one entry per line, surrounding whitespace removed.
with open('dictionary/stopWord_summar.txt', 'r', encoding='utf-8-sig') as f:
    stops = [line.strip() for line in f]

# Run the AutoSummary steps in order.
sentences, indexs = ausu.split_sentence(text)  # split into sentences at punctuation
tfidf = ausu.get_tfidf_matrix(sentences, stops)  # drop stop words, build the TF-IDF matrix
word_weight = ausu.get_sentence_with_words_weight(tfidf)  # keyword weight per sentence
posi_weight = ausu.get_sentence_with_position_weight(sentences)  # position weight per sentence
scores = ausu.get_similarity_weight(tfidf)  # similarity weight
# Rank the sentences using the three weights, each with factor 1.
sort_weight = ausu.ranking_base_on_weigth(word_weight, posi_weight, scores, feature_weight = [1,1,1])
summar = ausu.get_summarization(indexs, sort_weight, topK_ratio = 0.1)  # build the summary
print('原文:\n', text)
print('==========================================================')
print('摘要:\n',summar)

#-------------------------------------------------
Output:
摘要:
 ﻿台南出生的黃仁勳，25 年前在美國創辦的 NVIDIA，近年來屢創高峰，股價成長超過 700％、營收數字成長 94％。」

台灣時間雙十國慶的傍晚， 在德國慕尼黑，NVIDIA（輝達）執行長黃仁勳（JensenHuang）， 正準備上台發表 2018 第三場的全球 GPU 技術大會（GPU TechnologyConference，簡稱 GTC）。

黃仁勳也談到醫療的應用，尤其是在許多生物影像的判讀上，人工智慧可以讓醫師的工作變得更省力卻更精準；最後，他強調虛擬實境（VR）已跳脫遊戲的範疇，營建業可用來創造建築實景，車廠也可以拿來設計超跑。

從 2009 年開始舉辦的 GTC 全球巡迴，迄今已成為觀察 NVIDIA 發展情況的最重要機會，而黃仁勳正是其中最大的亮點。（見第 76 頁）其實今年 5 月，黃仁勳已被美國財經雜誌《巴隆》（BARRON'S ）選為全球三十位最佳執行長，與亞馬遜（Amazon） 執行長貝佐斯（Jeff Bezos）、臉書（Facebook） 創辦人祖克伯（Mark Zuckerberg）、波克夏（Berkshire Hathaway） 創辦人巴菲特（Warren Buffett）等人共列「具有遠見的創辦人」。

成長動力：符合人工智慧需求的 GPU 運算能力

非資通訊領域的台灣人，或許對 NVIDIA 不甚了解。

國際研究暨顧問機構顧能（Gartner） 資深研究總監，有 37 年半導體和電子產業研究相關經驗的艾倫．普萊耶斯特利（Alan Priestley） 表示，NVIDIA 之所以能屢創新高，正是由於電競和強調高速運算的資訊需求（例如 AI）在近年大幅成長。

在《富比士》（Forbes ）對黃仁勳的報導中，引述分析師的評論，黃仁勳是一位非常講究效率的領導人，他有很明確的願景，而且清楚了解 NVIDIA 的平行運算處理能力，可以解決未來五到十年內最困難的計算挑戰。



#####################################################

# summary_union.py   (讀取聯合新聞網並產生文章自動摘要)
#
# 需要檔案 AutoSummary.py, dictionary/stopWord_summar.txt
#     
import AutoSummary as ausu
import requests
from bs4 import BeautifulSoup as soup

# Stop-word list: one entry per line, surrounding whitespace removed.
# 'utf-8-sig' drops a leading byte-order mark if the file has one.
with open('dictionary/stopWord_summar.txt', 'r', encoding='utf-8-sig') as f:
    stops = [line.strip() for line in f]

# Collect the article links from the udn.com breaking-news index page.
url = 'https://udn.com/news/breaknews/1'
html = requests.get(url)
sp = soup(html.text, 'html.parser')
data1 = sp.select('#breaknews_body dl dt h2 a')
# Anchors without an href are skipped; concatenating None would raise TypeError.
urls = ['https://udn.com' + d.get('href') for d in data1 if d.get('href')]

# Fetch and summarize each article in turn.
for news_no, url in enumerate(urls, start=1):
    html = requests.get(url)
    sp = soup(html.text, 'html.parser')
    data1 = sp.select('#story_body_content p')  # article paragraphs
    print('處理第 {} 則新聞'.format(news_no))

    parts = []
    for d in data1:
        if '延伸閱讀' in d.text:  # the "further reading" marker ends the article body
            break
        parts.append(d.text)  # empty paragraphs add nothing to the joined text
    text = ''.join(parts)

    if not text:
        # Nothing matched the selector, so do not feed empty text to the summarizer.
        print('（無新聞內容，略過）')
        print('==========================================================')
        continue

    sentences, indexs = ausu.split_sentence(text)  # split into sentences at punctuation
    tfidf = ausu.get_tfidf_matrix(sentences, stops)  # drop stop words, build the TF-IDF matrix
    word_weight = ausu.get_sentence_with_words_weight(tfidf)  # keyword weight per sentence
    posi_weight = ausu.get_sentence_with_position_weight(sentences)  # position weight per sentence
    scores = ausu.get_similarity_weight(tfidf)  # similarity weight
    # Rank the sentences using the three weights, each with factor 1.
    sort_weight = ausu.ranking_base_on_weigth(word_weight, posi_weight, scores, feature_weight = [1,1,1])
    summar = ausu.get_summarization(indexs, sort_weight, topK_ratio = 0.3)  # build the summary
    print(summar)
    print('==========================================================')

#-------------------------------------------------
Output:
處理第 1 則新聞
Loading model cost 0.778 seconds.
Prefix dict has been built succesfully.
保險局預計11月將公布明年壽險業各幣別新契約責任準備金利率，但今天財委會上調降幅度提前曝光，預計新台幣、美元、人民幣保單責任準備金利率將調降1碼，10年期以上澳幣保單將調降2碼。顧立雄表示，保險公司每年保費增加幅度都在2兆多，現在之所以調降責任準備金利率，是因為各幣別公債市場殖利率都較去年下降，因此責任準備金利率在新台幣跟美元都較去年調降一碼，大家也都同意要調降。2019年10月21日 - 2019年11月20日活動期間您可獲得活動金幣 3,000 枚，當您看到優質新聞，即可點按文章中的「贊助好新聞」按鈕贊助該篇文章，且可隨時至會員中心查詢目前金幣的使用狀況。越多天登入贊助，中獎機率越高點按文章中的「贊助好新聞」，以活動金幣贊助該篇文章，支持心中優質新聞。據保險局的規劃，明年新台幣、美元、人民幣保單責任準備金都將調降1碼、10年期以上的澳幣保單預計調降最多達2碼。
==========================================================
處理第 2 則新聞
10月是國際反霸凌月，教育部最近在官方臉書粉絲專頁秀出八件中學制服，上面所繡的名字包括「娘炮」、「太平洋」、「高額頭」、「自以為混血」、「那個沒屌的」等，引起網友熱議。教育部也邀八名知名網紅合作，請他們分享過去被霸凌的經驗，今天將在官方臉書上釋出。2019年10月21日 - 2019年11月20日活動期間您可獲得活動金幣 3,000 枚，當您看到優質新聞，即可點按文章中的「贊助好新聞」按鈕贊助該篇文章，且可隨時至會員中心查詢目前金幣的使用狀況。越多天登入贊助，中獎機率越高點按文章中的「贊助好新聞」，以活動金幣贊助該篇文章，支持心中優質新聞。
==========================================================