0'12*)$%-./'34'5#267+-52'/-028' - lee...

22


Upload: doancong

Post on 23-Apr-2018

218 views

Category:

Documents


1 download

TRANSCRIPT

Page 1: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

Nutch: Crawling and Searching (A step-wise guide)

Objective:

• Perform a crawl of the given domain (e.g. http://*.ist.psu.edu, where * means any sequence of characters). A crawl usually starts with one (or many) seed URLs, which are fetched first; the outlinks from those pages are fetched next.

• Set up a web server with a servlet container (tomcat in this case) and perform the search operation upon the crawled pages.

Assumptions:

• A familiarity with the basic Linux commands and a Linux text editor. (Required commands are covered as we go along in this document; however, the more you know, the better.)

• Java, Nutch and tomcat installed on the system. (We have all of them in our case.)

Let's Begin:

//gsdas added

Get an SSH client or PuTTY from http://downloads.its.psu.edu

Specify the installation folder as the desktop if you don't have the required permissions

//end gsdas

Step 1: Login and Knowing your whereabouts on the system

Log in to the system using ssh. On Windows, the shell window should appear like:

Page 2: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

'

'

Click on “quick connect”; the following window should appear. Type the details of your login in the appropriate box (I am assuming team number 20 here).

'

Once you are logged in, the pwd command (which stands for “print working directory”) should show you where you are. E.g., upon logging in with the team20 username, it looks like this:

Page 3: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

'

'

'

'

'

The “ls” command should show you all the files and directories in the working directory. In the team20 directory it looks like:

Page 4: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

'

So, we have nutch and tomcat in team20's home directory. In our case, both nutch and tomcat are already installed, so we just need to configure these two tools and we are ready to go.

//gsdas added

If nutch is not already available in your home folder, copy it from /usr/local on ist441.

//end gsdas

'

Step 2: Configuring nutch for crawl

Page 5: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

First, we need to restrict the nutch tool to crawl only the particular domains that we want to crawl. For example, if we want to crawl the ist.psu.edu domain (or web site), we need to configure ./nutch-0.9/conf/crawl-urlfilter.txt. The following screenshot captures how to access this particular file:

'

Here, I first changed my working directory to the nutch home directory (where nutch is installed) and then opened the file using the vi text editor. Any other editor can also be used, or you can drag and drop the file to your Windows machine and then edit it. This can be achieved as shown below:

'

Page 6: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

The “New File Transfer” tab should open a new window where you can navigate to the directories and drag and drop the required files. However, I am assuming the shell prompt for the subsequent details.

'

'

'

'

'

'

'

'

'

'

'

1) crawl-urlfilter.txt: This is what the crawl-urlfilter.txt file for team20 looks like. The second-to-last line in the screenshot is where the configuration for the domain is supplied. Here are some particulars:

• It is a regular expression which implies fetching any page whose base url contains “ist.psu.edu”.

• ([a-z0-9]*\.) implies that any sequence made of lowercase letters and digits is allowed and that it must end with “.”; the “*” after it means that any number of such sequences is allowed.

• “+” means accept urls matching the regular expression that follows the “+”, and “-” means do not accept them. Also, an unescaped “.” in a regular expression matches any single character.

'R-"#"$%#"G'/$'('5(."'-(9'/,9'B(9"':#+'(9'-,,5677Ta[92.924/9,459:4"2:'*/++'B"'())"5,"2'B:,'

-,,5677,*",b2$94/9,459:4"2:'*/++'1%,'B"'())"5,"24''R-"'9(&"')(1'B"'2%1"'$%#'(1;'$/+"'"D,"19/%1'(9'*"++4'

@"9,G',-"'$/+"'/9'5#",,;'9"+$Y"D5+(1(,%#;4'

Page 7: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

'

2) nutch-default.xml: This file is responsible for providing your crawler a name that will be registered in the logs of the site that is being crawled. Here is how you can access the file:

Page 8: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

'

'

Once you open the file, you need to change the name of your http agent. This is what the http properties section of this file looks like for team20.

<!-- HTTP properties -->
<property>
  <name>http.agent.name</name>
  <value>team20</value>
  <description>HTTP 'User-Agent' request header. MUST NOT be empty -
  please set this to a single word uniquely related to your organization.

  NOTE: You should also check other related properties:

        http.robots.agents
        http.agent.description
        http.agent.url
        http.agent.email
        http.agent.version

  and set their values appropriately.

  </description>
</property>

'

Step 3: Crawling: Now we are ready to crawl. Staying in the nutch home directory, we first need to create a file that carries the seed url (the page that will be fetched first). Here is how it can be done:

Page 9: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

'

This command will create a file named url in the present working directory, and “echo” will write http://ist.psu.edu into the file. (Note: the seed url can be supplied at the command line while crawling, but it is good practice to supply the seed url/urls in a file.)

Next, we perform the crawl by issuing the crawl command. The following screenshot shows how to do this:

'

Here,

• “bin/nutch” is the executable and “crawl” is the Java class that implements the crawling functionality.

Page 10: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

• “url” is the file that contains the seed urls.

• “-dir tinycrawl” is the argument that supplies the name of the directory where all the segments will be stored. Make sure that “tinycrawl” does not exist in the working directory.

• “-depth 2” specifies the depth, i.e. the pages that are at a distance of up to 1 from the seed will be fetched.

This will complete the crawl operation. When the crawling operation is finished, you will see a directory named tinycrawl in your nutch directory.

If you are interested in analyzing what has been crawled before you start the operations for searching, you should read http://wiki.apache.org/nutch/08CommandLineOptions. This page contains the command line options available for nutch; readdb and readlinkdb are the classes that can be used for analysis purposes.

'

'

'

'

'

'

'

'

'

'

'

'

'

'

'

'

'

'

Page 11: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

Step 4: Searching

For the search requests (i.e. queries) to be served, we need a servlet container that can receive the request, read the index and send back the results. We will be using tomcat for this purpose. First, let us see where tomcat is installed.

'

'

The screenshot above shows where tomcat is installed. Here, I first navigated to the parent directory (cd ..) and issued the ls command to see the content of team20's home directory. “apache-tomcat-6.0.13” is the directory where tomcat is installed. The directory listing (ls) of this directory shows its content. “bin” is the directory where all the executables are located and “webapps” is the directory where all the web applications that run inside tomcat are deployed.

Next, we need to deploy our nutch application to this tomcat server. Here is how this is done:

1) First, we need to remove the root application that is running in tomcat and then copy the nutch web application file (nutch-0.9.war) into this directory. The following two commands take care of this operation.

• rm -rf webapps/ROOT*

• cp ../nutch-0.9/nutch-0.9.war webapps/ROOT.war

Make sure you are in the appropriate directory before running these commands; that directory is the home directory of tomcat. The following screenshot should clear any doubt:

Page 12: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

'

Next, we need to start the tomcat server, which is done by the following command:

• bin/catalina.sh start

It should display information as shown in the screenshot:

'

Page 13: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

Now the tomcat server is running, but it is not configured yet and therefore it cannot find the index that nutch created in the last step. So, we will configure the tomcat server before showing off the searching.

There are two files that need to be configured, which are

• webapps/ROOT/WEB-INF/classes/nutch-site.xml

• conf/server.xml

1) nutch-site.xml

The content of the first file is shown below.

<?xml version="1.0"?>

<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

</configuration>

'

And it should look like this:

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<nutch-conf>
<property>
<name>searcher.dir</name>
<value>/home/team20/nutch-0.9/tinycrawl</value>
</property>
</nutch-conf>

'

'

Note the two main changes:

• from href="configuration.xsl" to href="nutch-conf.xsl"

• the content inside <nutch-conf>..</nutch-conf>

The first change indicates the new configuration file where the display-oriented information for your nutch will be put. The second change is the path of the directory where your crawl is. (Here team20 is assumed; you should put the path according to your team number.)

'

2) server.xml:

Since every team will be working with the same tomcat server, we will need to run different instances of the tomcat server. To identify which index to work with (i.e. which instance of tomcat to work with), the tomcat server uses the combination of the IP address of the server machine

Page 14: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

and the port on which tomcat is listening. Since you cannot change the IP address of the machine, you will change the port.

The server.xml file contains the above-mentioned configuration detail, and the relevant xml tag's content for a freshly created instance is shown below.

<!-- A "Connector" represents an endpoint by which requests are

received

and responses are returned. Documentation at :

Java HTTP Connector: /docs/config/http.html (blocking & non-

blocking)

Java AJP Connector: /docs/config/ajp.html

APR (HTTP/AJP) Connector: /docs/apr.html

Define a non-SSL HTTP/1.1 Connector on port 8080

-->

<Connector port="8080" protocol="HTTP/1.1"

connectionTimeout="20000"

redirectPort="8443" />

The last three lines contain the configuration. Here the port number is 8080. Since we will use a different port for each team, it is recommended that you change the port number to 8080 + your team number, i.e. in this case 8080+20=8100 (since I am working with team 20).

'

Now we need to restart the tomcat server: stop it first and then start it. These two commands should do the work:

• bin/catalina.sh stop

• bin/catalina.sh start

3N"#"'S'-(0"'(99:&"2',-(,';%:#'P5*2Q'/9'P7-%&"7,"(&l7(5()-"Y,%&)(,Yj4U4[aQ4',"(&l'&"(19'

,"(&'1:&B"#4='

'

Assuming that my team number is 20 and hence the port number is 8080+20=8100, to access the nutch search interface I will type http://ist331.ist.psu.edu:8100/en/ in the browser.

Here is the screenshot of the interface:

Page 15: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

'

And if I search for “information”, here is what I get:

Page 16: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

Please stop your tomcat instance once you are done, for proper availability of the system.

'

'

'

'

'

'

'

'

'

'

'

'

Page 17: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

Analyzing the Analyzers:

'

The code below uses the analyzers specified and implemented in Lucene to analyze and tokenize the string provided by the user. Since Lucene will be indexing the tokens generated by the analyzers, it is instructive to look at the tokenization done by the different analyzers.

//AnalysisDemo.java

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.WhitespaceAnalyzer;

import org.apache.lucene.analysis.StopAnalyzer;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.Token;

import org.apache.lucene.analysis.SimpleAnalyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import java.io.*;

import java.io.IOException;

/**
 * Console demo that shows how several Lucene analyzers tokenize the same text.
 *
 * <p>Reads one line from standard input and prints, for each analyzer in
 * {@link #analyzers}, the terms that analyzer produces for that line.
 */
public class AnalysisDemo {

    /** Analyzers to compare; they are applied in this order. */
    private static final Analyzer[] analyzers = new Analyzer[]{
        new WhitespaceAnalyzer(),
        new SimpleAnalyzer(),
        new StopAnalyzer(),
        new StandardAnalyzer(),
    };

    /**
     * Prompts for a line of text on standard input and prints its tokenization.
     *
     * @param args ignored
     * @throws IOException if reading the input or tokenizing it fails
     */
    public static void main(String[] args) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        System.out.println("\nEnter the string that you want to analyze:");

        String line = br.readLine();
        if (line == null) {
            // readLine() returns null at end of input; a null would otherwise
            // reach new StringReader(null) and fail with a NullPointerException.
            System.err.println("No input received; nothing to analyze.");
            return;
        }
        analyze(line);
    }

    /**
     * Prints the terms each configured analyzer produces for {@code text}.
     *
     * @param text the text to tokenize; must not be null
     * @throws IOException if a token stream cannot be read or closed
     */
    private static void analyze(String text) throws IOException {
        System.out.println("Analyzing \"" + text + "\"");
        for (int i = 0; i < analyzers.length; i++) {
            Analyzer analyzer = analyzers[i];
            System.out.println("\t" + analyzer.getClass().getName() + ":");
            System.out.print("\t\t");

            TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
            try {
                // next() returns null once the stream is exhausted.
                for (Token token = stream.next(); token != null; token = stream.next()) {
                    System.out.print("[" + token.termText() + "] ");
                }
            } finally {
                stream.close();
            }
            System.out.println("\n");
        }
    }
}

Page 18: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

Creating a custom in-memory Index and searching on it:

The code below takes the specified documents and creates a Lucene index which is kept in memory. Lucene provides classes for each of the following operations that are necessary for search engine operation:

• index creation

• index searching

• query parsing

• content analysis

The code below shows all the operations in action. It is advised to play with this code and see it in action for learning purposes. For other classes and functionalities, go to the Lucene website and browse the class hierarchy.

'

'

//LuceneExample.java

import java.io.IOException;

import java.io.StringReader;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.Query;

import org.apache.lucene.document.Field;

import org.apache.lucene.search.Searcher;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.document.Document;

import org.apache.lucene.store.RAMDirectory;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.queryParser.ParseException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class LuceneExample

{

public static void main(String[] args)

{

// Construct a RAMDirectory to hold the in-memory representation

// of the index.

RAMDirectory idx = new RAMDirectory();

try

{

// Make an writer to create the index

IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(),

true);

// Add some Document objects containing quotes

writer.addDocument(createDocument("Theodore Roosevelt",

"It behooves every man to remember that the work of the "

+ "critic, is of altogether secondary importance, and that, "

+ "in the end, progress is accomplished by the man who does "

+ "things."));

writer.addDocument(createDocument("Friedrich Hayek",

Page 19: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

"The case for individual freedom rests largely on the "

+ "recognition of the inevitable and universal ignorance "

+ "of all of us concerning a great many of the factors on "

+ "which the achievements of our ends and welfare depend."));

writer.addDocument(createDocument("Ayn Rand",

"There is nothing to take a manâ!™s freedom away from "

+ "him, save other men. To be free, a man must be free "

+ "of his brothers."));

writer.addDocument(createDocument("Mohandas Gandhi",

"Freedom is not worth having if it does not connote "

+ "freedom to err."));

// Optimize and close the writer to finish building the index

writer.optimize();

writer.close();

// Build an IndexSearcher using the in-memory index

Searcher searcher = new IndexSearcher(idx);

// Run some queries

search(searcher, "freedom");

search(searcher, "free");

search(searcher, "progress or achievements");

searcher.close();

}

catch (IOException ioe)

{

// In this example we arenâ!™t really doing an I/O, so this

// exception should never actually be thrown.

ioe.printStackTrace();

}

catch (ParseException pe)

{

pe.printStackTrace();

}

}

/**

* Make a Document object with an un-indexed title field and an indexed

* content field.

*/

private static Document createDocument(String title, String content)

{

Document doc = new Document();

// Add the title as an unindexed fieldâ!"

doc.add(new Field("title", title, Field.Store.YES, Field.Index.NO));

// and the content as an indexed field. Note that indexed

// Text fields are constructed using a Reader. Lucene can read

// and index very large chunks of text, without storing the

// entire content verbatim in the index. In this example we

// can just wrap the content string in a StringReader.

doc.add(new Field("content", new StringReader(content)));

return doc;

}

/**

Page 20: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

* Searches for the given string in the "content" field

*/

private static void search(Searcher searcher, String queryString)

throws ParseException, IOException

{

// Build a Query object

QueryParser parser = new QueryParser("content", new StandardAnalyzer());

Query query = parser.parse(queryString);

// Search for the query

Hits hits = searcher.search(query);

// Examine the Hits object to see if there were any matches

int hitCount = hits.length();

if (hitCount == 0)

{

System.out.println("No matches were found for \"" + queryString +

"\"");

}

else

{

System.out.println("Hits for \"" + queryString

+ "\" were found in quotes by:");

// Iterate over the Documents in the Hits object

for (int i = 0; i < hitCount; i++)

{

Document doc = hits.doc(i);

// Print the value that we stored in the "title" field. Note

// that this Field was not indexed, but (unlike the

// "contents" field) was stored verbatim and can be

// retrieved.

System.out.println(" " + (i + 1) + ". " + doc.get("title"));

}

}

System.out.println();

}

}

Page 21: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

Storing an Index on the hard drive:

The code below shows how to store an index on the hard drive. It takes as input the directory where you want to write the index and the directory that needs to be indexed (in that order: <index dir> <data dir>).

//Indexer.java

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import java.io.File;

import java.io.IOException;

import java.io.FileReader;

public class Indexer {

public static void index(File indexDir, File dataDir) throws IOException

{

if (!dataDir.exists() || !dataDir.isDirectory()) {

throw new IOException(dataDir + " does not exist or is not a

directory");

}

IndexWriter writer = new IndexWriter(indexDir, new

StandardAnalyzer(), true);

indexDirectory(writer, dataDir);

writer.close();

}

private static void indexDirectory(IndexWriter writer, File dir) throws

IOException {

File[] files = dir.listFiles();

for (int i=0; i < files.length; i++) {

File f = files[i];

if (f.isDirectory()) {

indexDirectory(writer, f); // recurse

} else if (f.getName().endsWith(".txt")) {

indexFile(writer, f);

}

}

}

private static void indexFile(IndexWriter writer, File f) throws

IOException {

System.out.println("Indexing " + f.getName());

Document doc = new Document();

doc.add(Field.Text("contents", new FileReader(f)));

doc.add(Field.Keyword("filename", f.getCanonicalPath()));

writer.addDocument(doc);

}

public static void main(String[] args) throws Exception {

if (args.length != 2) {

Page 22: 0'12*)$%-./'34'5#267+-52'/-028' - Lee Gilesvalue>'/home/team20/nutch-0.9/tinycrawl   ' ' I%,"',-"',*%'&(/1')-(1."9G

throw new Exception("Usage: " + Indexer.class.getName() + "

<index dir> <data dir>");

}

File indexDir = new File(args[0]);

File dataDir = new File(args[1]);

index(indexDir, dataDir);

}

}

'

'