4. ์ํธํ ๊ธฐ๋ฒ์ผ๋ก ์์ ํ ๋ฉ์์ง ์ ์กํ๊ธฐ – “ํด๋ ๋ถ๊ฐ๋ฅํ ์ํธ๋ฌธ์ ์์ฑํด๋ณด์”
221020 ์์ฑ
<๋ณธ ๋ธ๋ก๊ทธ๋์ค์ ํ์ด์ฌ ํธ์ฆ์จ ํ๋ก์ ํธ์ github ๋ฅผ ์ฐธ๊ณ ํด์ ๊ณต๋ถํ๋ฉฐ ์์ฑํ์์ต๋๋ค>
https://www.onlybook.co.kr/m/entry/python-projects
์ค์ ํ์ด์ฌ ํธ์ฆ์จ ํ๋ก์ ํธ
์ค์ ํ์ด์ฌ ํธ์ฆ์จ ํ๋ก์ ํธ ๋ฌธ์ ํด๊ฒฐ๊ณผ ์ค๋ฌด ์์ฉ๋ ฅ์ ํค์ฐ๊ธฐ ์ํ ๋๋ง์ ํ์ด์ฌ ํฌํธํด๋ฆฌ์ค ๋ง๋ค๊ธฐ ๋ฆฌ ๋ณธ ์ง์ | ์คํ์ ์ฎ๊น 420์ชฝ | 28,000์ | 2022๋ 5์ 31์ผ ์ถ๊ฐ | 185*240*20 | ISBN13 9791.
www.onlybook.co.kr
https://github.com/rlvaugh/Real_World_Python
GitHub - rlvaugh/Real_World_Python: Code and supporting files for book Real World Python
Code and supporting files for book Real World Python - GitHub - rlvaugh/Real_World_Python: Code and supporting files for book Real World Python
github.com
๐ค NLTK๋ก ์ํธ๋ฌธ ์์ฑํ๊ธฐ
: ์ผ ํด๋ฆฟ์ ๋ฒ ์คํธ์ ๋ฌ ์คํ์ด ์์ค์ธ ใ๋ ๋ฒ ์นด์ ์ด์ ใ์ ๋์ค๋ ์ํ์ ํจ๋ ๋ฐฉ์์ ๋์งํธ ๋ฐฉ์์ผ๋ก ์ฌ๊ตฌ์ฑํด์, ์๋ฌด๋ ๊นฐ ์ ์๋ ์ํธ๋ฌธ์ ์ฌ๋ฌ๋ถ์ ์น๊ตฌ์ ํจ๊ป ๊ณต์ ํ๋ค.
: Collections ๋ชจ๋์ ํ์ฉ
from collections import Counter
import re
import requests
import bs4
import nltk
from nltk.corpus import stopwords
def main():
    """Summarize the MLK 'I Have a Dream' speech scraped from the web.

    Fetches the speech, cleans the text, prompts the user for summary
    parameters, scores each sentence by word frequency, and prints the
    top-scoring sentences.
    """
    # Scrape the speech text with web scraping.
    url = 'http://www.analytictech.com/mb021/mlk.htm'
    page = requests.get(url)
    page.raise_for_status()
    # BeautifulSoup extracts data from HTML, XML, HTML5, etc.
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    p_elems = [element.text for element in soup.find_all('p')]
    speech = ' '.join(p_elems)  # Make sure to join on a space!

    # Fix a known typo on the source page, then strip extra whitespace,
    # digits, and punctuation.  Raw strings (r'...') keep '\s' a valid
    # regex escape rather than an invalid string escape.
    speech = speech.replace(')mowing', 'knowing')
    speech = re.sub(r'\s+', ' ', speech)
    speech_edit = re.sub(r'[^a-zA-Z]', ' ', speech)
    speech_edit = re.sub(r'\s+', ' ', speech_edit)

    # Ask the user for summary parameters until whole numbers are given.
    while True:
        max_words = input("Enter max words per sentence for summary: ")
        num_sents = input("Enter number of sentences for summary: ")
        if max_words.isdigit() and num_sents.isdigit():
            break
        else:
            print("\nInput must be in whole numbers.\n")

    # Run the functions that produce the sentence scores.
    speech_edit_no_stop = remove_stop_words(speech_edit)
    word_freq = get_word_freq(speech_edit_no_stop)
    sent_scores = score_sentences(speech, word_freq, max_words)

    # Print the top-ranked sentences.
    counts = Counter(sent_scores)
    summary = counts.most_common(int(num_sents))
    print("\nSUMMARY:")
    for i in summary:
        print(i[0])
# """๋ฌธ์์ด์์ ์ค์ง ๋จ์ด๋ฅผ ์ ๊ฑฐํ๊ณ ๋ฌธ์์ด์ ๋ฐํ"""
def remove_stop_words(speech_edit):
stop_words = set(stopwords.words('english'))
speech_edit_no_stop = ''
for word in nltk.word_tokenize(speech_edit):
if word.lower() not in stop_words:
speech_edit_no_stop += word + ' '
return speech_edit_no_stop
# """๋ฌธ์์ด์์ ๋จ์ด ๋น๋ ์ฌ์ ์ ๋ฐํ"""
def get_word_freq(speech_edit_no_stop):
word_freq = nltk.FreqDist(nltk.word_tokenize(speech_edit_no_stop.lower()))
return word_freq
# """๋จ์ด ๋น๋์ ๋ฐ๋ผ ๋ฌธ์ฅ ์ ์์ ์ฌ์ ์ ๋ฐํ"""
def score_sentences(speech, word_freq, max_words):
sent_scores = dict()
sentences = nltk.sent_tokenize(speech)
for sent in sentences:
sent_scores[sent] = 0
words = nltk.word_tokenize(sent.lower())
sent_word_count = len(words)
if sent_word_count <= int(max_words):
for word in words:
if word in word_freq.keys():
sent_scores[sent] += word_freq[word]
sent_scores[sent] = sent_scores[sent] / sent_word_count
return sent_scores
# Run the summarizer only when executed as a script, not when imported.
if __name__ == '__main__':
    main()