summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/test/t_factors
blob: c1127077832db893f234acfc4b9b8a8a86d3878c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#!/bin/sh
# Part of the ht://Dig package   <http://www.htdig.org/>
# Copyright (c) 1999-2004 The ht://Dig Group
# For copyright details, see the file COPYING in your distribution
# or the GNU Library General Public License (LGPL) version 2 or later
# <http://www.gnu.org/copyleft/lgpl.html>
#
# $Id: t_factors,v 1.7 2004/06/05 06:26:22 lha Exp $
#

# Tests (or should eventually test) the following config attributes:
#	author_factor
#	backlink_factor
#	caps_factor
#	date_factor		(TODO)
#	description_factor
#	heading_factor
#	keywords_factor
#	meta_description_factor
#	multimatch_factor
#	search_results_order
#	text_factor
#	title_factor
#	url_seed_score
#	url_text_factor

# try_order comment query pattern1 pattern2 ...
#	comment	- description of test, displayed if error occurs
#	query	- search string passed to htsearch
#	pattern	- awk regular expressions expected to match lines of the
#		  htsearch output *in order*; one output line satisfies at
#		  most one pattern, so each pattern needs a line of its own
#
# Uses $htsearch, $awk, $config and $tmp as set by the calling script, and
# calls fail when some pattern is not found where expected.
try_order() {
    comment="$1"
    shift
    query="$1"
    shift
    # $config and $tmp are quoted so that paths containing blanks survive.
    $htsearch -c "$config" "$query" > "$tmp" 2> /dev/null
    # The remaining arguments are handed to awk as operands instead of being
    # pasted into the program text, so quotes or backslashes inside a pattern
    # cannot alter the awk program.
    miss=`$awk '
	BEGIN {
	    # Move the expected patterns out of ARGV so that awk does not
	    # treat them as input files and reads the search output on stdin.
	    for (i = 1; i < ARGC; i++) {
		expected[i - 1] = ARGV[i]
		ARGV[i] = ""
	    }
	    count = ARGC - 1
	    line = 0
	}
	line < count && $0 ~ expected[line] { line++ }
	END { if (line < count) print expected[line] }
    ' "$@" < "$tmp"`
    if [ "$miss" != "" ]
    then
	# Re-run verbosely so the diagnostics end up in the test log.
	$htsearch -vv -c "$config" "$query" > /dev/null
	echo "String \"$miss\" was not found where expected"
	fail "$htsearch -c $config '$query' >> $tmp --
	      $comment"
    fi
}




# Load the shared test helpers; the action variable is read by test_functions
# (presumably to start the test web server -- see that file).
test_functions_action=--start-apache
. ./test_functions

config="$testdir/conf/htdig.conf.tmp"

# Scratch file for htsearch output.  Prefer an unpredictable name from mktemp;
# fall back to the historical PID-based name where mktemp is not available.
tmp=`mktemp /tmp/t_htsearch.XXXXXX 2> /dev/null` || tmp=/tmp/t_htsearch$$

# set up config file with chosen non-default values
cp "$testdir/conf/htdig.conf" "$config"	|| fail "Couldn't copy config"

$htdig "$@" -t -i -c "$config"	|| fail "Couldn't dig"
$htpurge -c "$config"		|| fail "Couldn't purge"

# Baseline: ranking of the four "also" documents before any adjustment.
try_order "Search for 'also'" \
    "words=also" \
    '4 matches' 'site2.html' 'site4.html' 'bad_local.htm' 'script.html'

# url_seed_score: seeding site4 should lift site4.html above site2.html.
set_attr url_seed_score		"site4 *1000+1000"
try_order "Seed score 1000 for site4.html" \
    "words=also" \
    '4 matches' 'site4.html' 'site2.html' 'bad_local.htm' 'script.html'

# Two separate pattern/formula pairs: both seeded documents come first.
set_attr url_seed_score		"site4 *1000+1000 script *1000+1000"
try_order "Seed score 1000 for site4.html and script.html" \
    "words=also" \
    '4 matches' 'site4.html' 'script.html' 'site2.html' 'bad_local.htm'

# The same two documents selected by one alternation pattern.
set_attr url_seed_score		"site4|script *1000+1000"
try_order "Seed score 1000 for site4|script" \
    "words=also" \
    '4 matches' 'site4.html' 'script.html' 'site2.html' 'bad_local.htm'

# search_results_order: the listed pattern is expected ahead of the rest.
set_attr search_results_order		"bad_local"
try_order "Search_results_order bad_local" \
    "words=also" \
    '4 matches' 'bad_local.htm' 'site4.html' 'script.html' 'site2.html'

# script.html first, URLs matching e2|e4 (site2/site4) last, and everything
# else ("*") in between.  The description now matches the attribute value.
set_attr search_results_order		"script * e2|e4"
try_order "Search_results_order script * e2|e4" \
    "words=also" \
    '4 matches' 'script.html' 'bad_local.htm' 'site4.html' 'site2.html'

# Clear both ordering overrides, then switch every scoring factor off.
set_attr url_seed_score ""
set_attr search_results_order ""
# caps_factor and url_text_factor are not implemented; date_factor is TODO.
for zero_factor in \
	author_factor backlink_factor caps_factor date_factor \
	description_factor heading_factor keywords_factor \
	meta_description_factor multimatch_factor text_factor \
	title_factor url_text_factor
do
    set_attr $zero_factor 0
done

# With every factor at 0 neither query is expected to return anything.
try_order "Search with factors 0" "words=also" 'No matches'

try_order "Search for 'service' with title_factor 0" "words=service" 'No matches'
# With title_factor as the only non-zero factor, a single match
# (script.html) is expected.
set_attr title_factor		1
try_order "Search for 'service' with title_factor 1" \
    "words=service" \
    '1 matches' 'script.html'
# A small text_factor brings in three more documents; script.html is
# still expected first because of the larger title_factor.
set_attr text_factor		0.3
try_order "Greater weight to title factor" \
    "words=service" \
    '4 matches' 'script.html' 'site4.html' 'site%201.html' 'site3.html'
# A negative title_factor is expected to push script.html to last place.
set_attr title_factor		-3.2
try_order "Checking negative title factor" \
    "words=service" \
    '4 matches' 'site4.html' 'site%201.html' 'site3.html' 'script.html'
# Put both factors back to 0 for the tests that follow.
set_attr title_factor		0
set_attr text_factor		0

# test with all factors 0 except the one which matches
#
# _only_factor factor word document
#	Set <factor> to 1, expect a search for <word> to report exactly one
#	match naming <document>, then set <factor> back to 0.
_only_factor() {
    set_attr "$1" 1
    try_order "Search for '$2' with $1 1" \
	"words=$2" \
	'1 matches' "$3"
    set_attr "$1" 0
}

_only_factor description_factor		crossRef	'site%201.html'

_only_factor author_factor		media		'script.html'

_only_factor meta_description_factor	stars		'site2.html'

_only_factor heading_factor		obtain		'bad_local.htm'

_only_factor keywords_factor		newWord		'title.html'


# test with all document-based factors non-zero except the one which matches
#
# backlink_factor and date_factor are not document based, so they are left
# untouched here.  Each factor is set exactly once (the original list set
# description_factor twice).
for one_factor in \
	author_factor caps_factor description_factor heading_factor \
	keywords_factor meta_description_factor multimatch_factor \
	text_factor title_factor url_text_factor
do
    set_attr $one_factor 1
done

# _without_factor factor word pattern...
#	Set <factor> to 0, search for <word> and check the output against the
#	given patterns, then set <factor> back to 1.
_without_factor() {
    _wf_factor="$1"
    _wf_word="$2"
    shift
    shift
    set_attr "$_wf_factor" 0
    try_order "Search for '$_wf_word' with $_wf_factor 0" \
	"words=$_wf_word" \
	"$@"
    set_attr "$_wf_factor" 1
}

_without_factor description_factor	crossRef	'1 matches' 'title.html'

_without_factor author_factor		media		'No matches'

_without_factor meta_description_factor	stars		'No matches'

_without_factor heading_factor		obtain		'No matches'

_without_factor keywords_factor		newWord		'No matches'

# multimatch_factor gives a "boost" to searches matching multiple terms
# title_factor is raised first so that bad_local.htm is expected to rank on
# top once the multimatch boost is switched off again.
set_attr title_factor			10	# "get" in title of bad_local
# Very large boost: site4.html is expected ahead of bad_local.htm.
set_attr multimatch_factor		10000
try_order "Search for 'get or interest or repay' with multimatch_factor 10000" \
    "words=get+interest+repay;method=or" \
    '2 matches' 'site4.html' 'bad_local.htm'
# No boost: the expected order flips, bad_local.htm first.
set_attr multimatch_factor		0
try_order "Search for 'get or interest or repay' with multimatch_factor 0" \
    "words=get+interest+repay;method=or" \
    '2 matches' 'bad_local.htm' 'site4.html'

# backlink counts the number of references (of any type) to this document
# Without the backlink weighting site4.html is expected first ...
set_attr backlink_factor		0
try_order "site4.html has repay+interest, site 1.html only has suggestions" \
    "words=suggestions+repay+interest;method=or" \
    '2 matches' 'site4.html' 'site%201.html'
# ... and with a backlink_factor of 100 "site 1.html" is expected to overtake it.
set_attr backlink_factor		100
try_order "site 1.html has a higher ratio of backlinks to outgoing links" \
    "words=suggestions+repay+interest;method=or" \
    '2 matches' 'site%201.html' 'site4.html'

# Every check passed: remove the scratch file that held the htsearch output
# so repeated runs do not leave files behind in /tmp.
# NOTE(review): presumably fail aborts the script before this point, which
# keeps the file around for inspection after a failure -- confirm in
# test_functions.
rm -f "$tmp"

# Source the helpers again with the stop action (presumably shuts down the
# test web server -- see test_functions).
test_functions_action=--stop-apache
. ./test_functions