3. Summarizing Text with NLTK – "Let's Pull Out the Key Sentences and Build a Word Cloud"
Written 2022-10-17
<This post was written while studying the book 실전 파이썬 핸즈온 프로젝트 (Practical Python Hands-On Projects) and its GitHub repository>
https://www.onlybook.co.kr/m/entry/python-projects
📚 Summarizing text with NLTK
: Scrape a famous speech such as Martin Luther King's "I Have a Dream" from the web and build your own extractive summary of it, then turn the body of a novel into a slick advertisement or teaser.
: Uses BeautifulSoup, Requests, regex, NLTK, Collections, wordcloud, matplotlib, and more.
"""
To run this program install Gensim 3.8.3 (https://pypi.org/project/gensim/3.8.3/)
"""
from collections import Counter
import re
import requests
import bs4
import nltk
from nltk.corpus import stopwords
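# First-time setup: word_tokenize/sent_tokenize and the stop-word list below
# need the NLTK data packages 'punkt' and 'stopwords'; download them once with
# nltk.download('punkt') and nltk.download('stopwords').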
# Obtain the text with web scraping.
def main():
    # Use webscraping to obtain the text.
    url = 'http://www.analytictech.com/mb021/mlk.htm'
    page = requests.get(url)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    p_elems = [element.text for element in soup.find_all('p')]
    speech = ' '.join(p_elems)  # Make sure to join on a space!

    # Fix typos and remove extra whitespace, digits, and punctuation.
    speech = speech.replace(')mowing', 'knowing')
    speech = re.sub(r'\s+', ' ', speech)
    speech_edit = re.sub(r'[^a-zA-Z]', ' ', speech)
    speech_edit = re.sub(r'\s+', ' ', speech_edit)
    # Request input.
    while True:
        max_words = input("Enter max words per sentence for summary: ")
        num_sents = input("Enter number of sentences for summary: ")
        if max_words.isdigit() and num_sents.isdigit():
            break
        else:
            print("\nInput must be in whole numbers.\n")

    # Run the functions that generate the sentence scores.
    speech_edit_no_stop = remove_stop_words(speech_edit)
    word_freq = get_word_freq(speech_edit_no_stop)
    sent_scores = score_sentences(speech, word_freq, max_words)

    # Print the top-ranked sentences.
    counts = Counter(sent_scores)
    summary = counts.most_common(int(num_sents))
    print("\nSUMMARY:")
    for i in summary:
        print(i[0])
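    # Counter.most_common(n) returns the n (sentence, score) pairs with the
    # highest scores, so the summary sentences print in score order rather
    # than in the order they appear in the speech.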
# """๋ฌธ์์ด์์ ์ค์ง ๋จ์ด๋ฅผ ์ ๊ฑฐํ๊ณ ๋ฌธ์์ด์ ๋ฐํ"""
def remove_stop_words(speech_edit):
"""Remove stop words from string and return string."""
stop_words = set(stopwords.words('english'))
speech_edit_no_stop = ''
for word in nltk.word_tokenize(speech_edit):
if word.lower() not in stop_words:
speech_edit_no_stop += word + ' '
return speech_edit_no_stop
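    # NLTK's English stop-word list holds very common, low-information words
    # such as 'the', 'and', and 'of'; dropping them keeps the frequency counts
    # focused on the words that carry the speech's meaning.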
# """๋ฌธ์์ด์์ ๋จ์ด ๋น๋ ์ฌ์ ์ ๋ฐํ"""
def get_word_freq(speech_edit_no_stop):
"""Return a dictionary of word frequency in a string."""
word_freq = nltk.FreqDist(nltk.word_tokenize(speech_edit_no_stop.lower()))
return word_freq
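    # For example, nltk.FreqDist(['dream', 'dream', 'free']) behaves like a
    # dictionary, FreqDist({'dream': 2, 'free': 1}), so word_freq[word] in
    # score_sentences() looks up how often each word occurs in the speech.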
# """๋จ์ด ๋น๋์ ๋ฐ๋ผ ๋ฌธ์ฅ ์ ์์ ์ฌ์ ์ ๋ฐํ"""
def score_sentences(speech, word_freq, max_words):
"""Return dictionary of sentence scores based on word frequency."""
sent_scores = dict()
# ๋ฌธ์ฅ ํ ํฐํ
sentences = nltk.sent_tokenize(speech)
for sent in sentences:
sent_scores[sent] = 0
words = nltk.word_tokenize(sent.lower())
sent_word_count = len(words)
if sent_word_count <= int(max_words):
for word in words:
if word in word_freq.keys():
sent_scores[sent] += word_freq[word]
sent_scores[sent] = sent_scores[sent] / sent_word_count
return sent_scores
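    # Dividing by sent_word_count normalizes each score so short, dense
    # sentences can compete with long ones; sentences longer than max_words
    # are never scored and stay at 0, which keeps them near the bottom of the
    # ranking.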
if __name__ == '__main__':
    main()
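The library list above also names wordcloud and matplotlib, which this project uses to turn text into a word-cloud advertisement; the listing here covers only the summarization half. Below is a minimal sketch of the word-cloud step, assuming a locally saved plain-text file; the file name hound.txt and every styling parameter are my own illustrative choices, not the book's script.

# word_cloud_sketch.py - illustrative only; file name and parameters are assumptions.
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Load any plain-text source, e.g. a public-domain novel saved locally.
with open('hound.txt', encoding='utf-8') as infile:  # hypothetical file name
    text = infile.read()

# Build the cloud: the text is tokenized, the passed-in stop words are dropped,
# and each remaining word is sized by its frequency.
wc = WordCloud(width=800,
               height=600,
               background_color='white',
               stopwords=STOPWORDS,
               max_words=200)
wc.generate(text)

# Render the cloud with matplotlib and hide the axes.
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

A fuller version would usually add an image mask and a custom color map, but the core flow stays the same: build a WordCloud, call generate() on the raw text, and show the result with matplotlib.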