playing with natural language ― in a friendly way
TRANSCRIPT
我是誰
✓ 蔡家琦 (Tsai, Chia-Chi)
✓ ID : Rueshyna (Rues)
✓ 工作 : Ruby on Rails
✓ Machine Learning & Text Mining
斷句
Today many historians think that only about twenty percent of the colonists supported Britain. Some colonists supported whichever side seemed to be winning.
-- from VOA
斷句
Today many historians think that only about twenty percent of the colonists supported Britain. Some colonists supported whichever side seemed to be winning.
-- from VOA
•Today many historians think that only about twenty percent of the colonists supported Britain.
•Some colonists supported whichever side seemed to be winning.
利用“.”
斷句
Iowa-based political committee in 2007 and has grown larger since taking a leading role now against Mr. Hagel. “Postelection we have new battle lines being drawn with the president; he kicks it off with these nominations and it made sense for us.”
-- from New York Times
斷句
Iowa-based political committee in 2007 and has grown larger since taking a leading role now against Mr. Hagel. “Postelection we have new battle lines being drawn with the president; he kicks it off with these nominations and it made sense for us.”
-- from New York Times
利用 “.”
•Iowa-based political committee in 2007 and has grown larger since taking a leading role now against Mr.
•Hagel. •“Postelection we have new battle lines being drawn with the president; he kicks it off with these nominations and it made sense for us.
•”
分詞(tokenization)
Today is a beautiful day (利用空白)
beautiful day.$50
for youths;
“Who knows?”
industry’s
[Today] [is] [a] [beautiful] [day]
分詞(tokenization)
Today is a beautiful day (利用空白)
beautiful day.$50
for youths;
“Who knows?”
industry’s
[Today] [is] [a] [beautiful] [day]
詞性(pos)
Pierre/NNP Vinken/NNP 61/CD years/NNS ,/, old/JJ will/MD
join/VB the/DT board/NN as/IN a/DT nonexecutive/JJ director/NN
Nov./NNP 29/CD ./.
Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
penn treebank tag
#!/usr/bin/env python
# Demo 1: fetch a VOA article, strip HTML markup, split the text into
# sentences, tokenize one sentence into words, and POS-tag each word.
import nltk
from urllib import urlopen

url = "http://www.voanews.com/articleprintview/1587223.html"
# Close the HTTP response explicitly instead of leaking the connection.
response = urlopen(url)
try:
    html = response.read()
finally:
    response.close()
raw = nltk.clean_html(html)

# Requires the Punkt sentence-boundary model: nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = sent_tokenizer.tokenize(raw)

# Word-tokenize the second sentence of the article.
token = nltk.word_tokenize(sents[1])

# Requires the tagger model: nltk.download('maxent_treebank_pos_tagger')
pos = nltk.pos_tag(token)
Demo1
#!/usr/bin/env python
# Demo 1: fetch a VOA article, strip HTML markup, split the text into
# sentences, tokenize one sentence into words, and POS-tag each word.
import nltk
from urllib import urlopen

url = "http://www.voanews.com/articleprintview/1587223.html"
# Close the HTTP response explicitly instead of leaking the connection.
response = urlopen(url)
try:
    html = response.read()
finally:
    response.close()
raw = nltk.clean_html(html)

# Requires the Punkt sentence-boundary model: nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = sent_tokenizer.tokenize(raw)

# Word-tokenize the second sentence of the article.
token = nltk.word_tokenize(sents[1])

# Requires the tagger model: nltk.download('maxent_treebank_pos_tagger')
pos = nltk.pos_tag(token)
Demo1
清除html tag
#!/usr/bin/env python
# Demo 1: fetch a VOA article, strip HTML markup, split the text into
# sentences, tokenize one sentence into words, and POS-tag each word.
import nltk
from urllib import urlopen

url = "http://www.voanews.com/articleprintview/1587223.html"
# Close the HTTP response explicitly instead of leaking the connection.
response = urlopen(url)
try:
    html = response.read()
finally:
    response.close()
raw = nltk.clean_html(html)

# Requires the Punkt sentence-boundary model: nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = sent_tokenizer.tokenize(raw)

# Word-tokenize the second sentence of the article.
token = nltk.word_tokenize(sents[1])

# Requires the tagger model: nltk.download('maxent_treebank_pos_tagger')
pos = nltk.pos_tag(token)
Demo1
斷句
#!/usr/bin/env python
# Demo 1: fetch a VOA article, strip HTML markup, split the text into
# sentences, tokenize one sentence into words, and POS-tag each word.
import nltk
from urllib import urlopen

url = "http://www.voanews.com/articleprintview/1587223.html"
# Close the HTTP response explicitly instead of leaking the connection.
response = urlopen(url)
try:
    html = response.read()
finally:
    response.close()
raw = nltk.clean_html(html)

# Requires the Punkt sentence-boundary model: nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = sent_tokenizer.tokenize(raw)

# Word-tokenize the second sentence of the article.
token = nltk.word_tokenize(sents[1])

# Requires the tagger model: nltk.download('maxent_treebank_pos_tagger')
pos = nltk.pos_tag(token)
Demo1
分詞
#!/usr/bin/env python
# Demo 1: fetch a VOA article, strip HTML markup, split the text into
# sentences, tokenize one sentence into words, and POS-tag each word.
import nltk
from urllib import urlopen

url = "http://www.voanews.com/articleprintview/1587223.html"
# Close the HTTP response explicitly instead of leaking the connection.
response = urlopen(url)
try:
    html = response.read()
finally:
    response.close()
raw = nltk.clean_html(html)

# Requires the Punkt sentence-boundary model: nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = sent_tokenizer.tokenize(raw)

# Word-tokenize the second sentence of the article.
token = nltk.word_tokenize(sents[1])

# Requires the tagger model: nltk.download('maxent_treebank_pos_tagger')
pos = nltk.pos_tag(token)
Demo1
詞性標記
#!/usr/bin/env python
# Demo 2: induce a context-free grammar from Penn Treebank parse trees,
# then chart-parse the first treebank sentence with that grammar.
# Requires the corpus: nltk.download('treebank')
import nltk
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal
from nltk.parse import ChartParser

# Unique productions observed across the first nine parsed sentences.
productions = set(
    production
    for sent in treebank.parsed_sents()[0:9]
    for production in sent.productions()
)

grammar = ContextFreeGrammar(Nonterminal('S'), productions)
parser = ChartParser(grammar)

parsed_tree = parser.parse(treebank.sents()[0])
# print parsed_tree
Demo2
#!/usr/bin/env python
# Demo 2: induce a context-free grammar from Penn Treebank parse trees,
# then chart-parse the first treebank sentence with that grammar.
# Requires the corpus: nltk.download('treebank')
import nltk
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal
from nltk.parse import ChartParser

# Unique productions observed across the first nine parsed sentences.
productions = set(
    production
    for sent in treebank.parsed_sents()[0:9]
    for production in sent.productions()
)

grammar = ContextFreeGrammar(Nonterminal('S'), productions)
parser = ChartParser(grammar)

parsed_tree = parser.parse(treebank.sents()[0])
# print parsed_tree
Demo2
treebank production
#!/usr/bin/env python
# Demo 2: induce a context-free grammar from Penn Treebank parse trees,
# then chart-parse the first treebank sentence with that grammar.
# Requires the corpus: nltk.download('treebank')
import nltk
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal
from nltk.parse import ChartParser

# Unique productions observed across the first nine parsed sentences.
productions = set(
    production
    for sent in treebank.parsed_sents()[0:9]
    for production in sent.productions()
)

grammar = ContextFreeGrammar(Nonterminal('S'), productions)
parser = ChartParser(grammar)

parsed_tree = parser.parse(treebank.sents()[0])
# print parsed_tree
Demo2
encoder grammar
#!/usr/bin/env python
# Demo 2: induce a context-free grammar from Penn Treebank parse trees,
# then chart-parse the first treebank sentence with that grammar.
# Requires the corpus: nltk.download('treebank')
import nltk
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal
from nltk.parse import ChartParser

# Unique productions observed across the first nine parsed sentences.
productions = set(
    production
    for sent in treebank.parsed_sents()[0:9]
    for production in sent.productions()
)

grammar = ContextFreeGrammar(Nonterminal('S'), productions)
parser = ChartParser(grammar)

parsed_tree = parser.parse(treebank.sents()[0])
# print parsed_tree
Demo2
產生parser
#!/usr/bin/env python
# Demo 3: word-frequency statistics over the Brown and Reuters corpora.
# Requires the corpus: nltk.download('reuters')
import nltk
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist
from nltk.corpus import reuters
from nltk.corpus import brown

# Frequency distribution of the (lower-cased) first 50 Brown words.
fd = FreqDist(map(lambda w: w.lower(), brown.words()[0:50]))
# fd.tabulate(10)
# fd.plot()

# Explicit name->corpus mapping instead of eval(): eval on a string is
# fragile and an injection hazard if the name ever comes from outside.
corpora = {'reuters': reuters, 'brown': brown}
# Build the year-string set once; the original rebuilt the map() list for
# every single word tested in the generator condition.
years = set(map(str, range(1900, 1950, 5)))
cdf = ConditionalFreqDist(
    (name, word)
    for name in ['reuters', 'brown']
    for word in corpora[name].words()
    if word in years
)
# cdf.conditions()
# cdf['brown']['1910']
# cdf.tabulate()
# cdf.plot()
Demo3
#!/usr/bin/env python
# Demo 3: word-frequency statistics over the Brown and Reuters corpora.
# Requires the corpus: nltk.download('reuters')
import nltk
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist
from nltk.corpus import reuters
from nltk.corpus import brown

# Frequency distribution of the (lower-cased) first 50 Brown words.
fd = FreqDist(map(lambda w: w.lower(), brown.words()[0:50]))
# fd.tabulate(10)
# fd.plot()

# Explicit name->corpus mapping instead of eval(): eval on a string is
# fragile and an injection hazard if the name ever comes from outside.
corpora = {'reuters': reuters, 'brown': brown}
# Build the year-string set once; the original rebuilt the map() list for
# every single word tested in the generator condition.
years = set(map(str, range(1900, 1950, 5)))
cdf = ConditionalFreqDist(
    (name, word)
    for name in ['reuters', 'brown']
    for word in corpora[name].words()
    if word in years
)
# cdf.conditions()
# cdf['brown']['1910']
# cdf.tabulate()
# cdf.plot()
Demo3
詞頻統計
#!/usr/bin/env python
# Demo 3: word-frequency statistics over the Brown and Reuters corpora.
# Requires the corpus: nltk.download('reuters')
import nltk
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist
from nltk.corpus import reuters
from nltk.corpus import brown

# Frequency distribution of the (lower-cased) first 50 Brown words.
fd = FreqDist(map(lambda w: w.lower(), brown.words()[0:50]))
# fd.tabulate(10)
# fd.plot()

# Explicit name->corpus mapping instead of eval(): eval on a string is
# fragile and an injection hazard if the name ever comes from outside.
corpora = {'reuters': reuters, 'brown': brown}
# Build the year-string set once; the original rebuilt the map() list for
# every single word tested in the generator condition.
years = set(map(str, range(1900, 1950, 5)))
cdf = ConditionalFreqDist(
    (name, word)
    for name in ['reuters', 'brown']
    for word in corpora[name].words()
    if word in years
)
# cdf.conditions()
# cdf['brown']['1910']
# cdf.tabulate()
# cdf.plot()
Demo3
不同條件的詞頻統計(Conditions and Events)