SHOGUN
3.2.1
首页
相关页面
模块
类
文件
文件列表
文件成员
全部
类
命名空间
文件
函数
变量
类型定义
枚举
枚举值
友元
宏定义
组
页
src
shogun
features
StringFileFeatures.cpp
浏览该文件的文档.
1
#include <
shogun/features/StringFileFeatures.h
>
2
3
namespace
shogun
4
{
5
6
/**
 * Default constructor.
 *
 * Default-constructs the CStringFeatures<ST> base and leaves the object
 * without a backing file (file is set to NULL).
 */
template <class ST>
CStringFileFeatures<ST>::CStringFileFeatures()
	: CStringFeatures<ST>(),
	  file(NULL)
{
}
9
10
/**
 * Construct string features backed by a memory-mapped file.
 *
 * @param fname path handed to CMemoryMappedFile<ST>
 * @param alpha alphabet forwarded to the CStringFeatures<ST> base
 *
 * Once the mapping object exists, fetch_meta_info_from_file() is invoked
 * with its default granularity to index the file's lines.
 */
template <class ST>
CStringFileFeatures<ST>::CStringFileFeatures(const char* fname, EAlphabet alpha)
	: CStringFeatures<ST>(alpha),
	  file(new CMemoryMappedFile<ST>(fname))
{
	fetch_meta_info_from_file();
}
16
17
/**
 * Destructor.
 *
 * Drops this object's reference to the memory-mapped file and then runs
 * this class's cleanup() to release the string table and related state.
 */
template <class ST>
CStringFileFeatures<ST>::~CStringFileFeatures()
{
	// Release the mapping first, then the bookkeeping built on top of it.
	SG_UNREF(file);
	CStringFileFeatures<ST>::cleanup();
}
22
23
/**
 * Return the next line of the memory-mapped file, starting at offs.
 *
 * Scans forward from offs until a '\n' is found. Every non-newline symbol
 * met on the way is checked against the alphabet; an invalid symbol triggers
 * cleanup() followed by SG_CLASS_ERROR.
 *
 * A final line that is not terminated by '\n' is returned as a regular line
 * instead of being silently discarded.
 *
 * @param len         set to the line length (newline excluded), or 0 when
 *                    no line is left
 * @param offs        in: offset at which to start scanning;
 *                    out: offset just past the returned line, or
 *                    file_length when no line is left
 * @param line_nr     incremented by one for every line returned
 * @param file_length number of ST elements available in the mapping
 * @return pointer to the first symbol of the line inside the mapping (not a
 *         copy, not NUL-terminated), or NULL when no line is left
 */
template <class ST>
ST* CStringFileFeatures<ST>::get_line(uint64_t& len, uint64_t& offs, int32_t& line_nr, uint64_t file_length)
{
	ST* s = file->get_map();

	for (uint64_t i=offs; i<file_length; i++)
	{
		ST c=s[i];

		if (c == '\n')
		{
			ST* line=&s[offs];
			len=i-offs;
			offs=i+1;
			line_nr++;
			return line;
		}

		// Validate and report the very same 8-bit symbol. Passing it as an
		// int keeps the "%c" argument well-typed for every ST (wide integer
		// and floating-point instantiations included).
		const uint8_t symbol=(uint8_t) c;
		if (!CStringFeatures<ST>::alphabet->is_valid(symbol))
		{
			CStringFileFeatures<ST>::cleanup();
			SG_CLASS_ERROR(CStringFeatures<ST>, "Invalid character (%c) in line %d\n", (int32_t) symbol, line_nr)
		}
	}

	// Reached the end of the data without seeing '\n': whatever lies between
	// offs and file_length is a last, unterminated line.
	if (offs<file_length)
	{
		ST* line=&s[offs];
		len=file_length-offs;
		offs=file_length;
		line_nr++;
		return line;
	}

	len=0;
	offs=file_length;
	return NULL;
}
52
53
/**
 * Reset the feature object to an empty state.
 *
 * Sets the vector count to zero, frees the string table and the symbol mask
 * table (both pointers are reset to NULL) and swaps the alphabet for a newly
 * created one of the same alphabet type.
 */
template <class ST>
void CStringFileFeatures<ST>::cleanup()
{
	this->num_vectors=0;

	// Only the table itself is freed here; the individual string pointers
	// are not released by this function.
	SG_FREE(this->features);
	this->features=NULL;

	SG_FREE(this->symbol_mask_table);
	this->symbol_mask_table=NULL;

	/* Begin with a pristine alphabet. Rather than clearing the histogram of
	 * the current one, allocate a replacement so that an alphabet object
	 * that is shared with others stays untouched.
	 */
	CAlphabet* fresh_alphabet=new CAlphabet(this->alphabet->get_alphabet());
	SG_UNREF(this->alphabet);
	this->alphabet=fresh_alphabet;
	SG_REF(this->alphabet);
}
70
71
/**
 * Cleaning up a single feature vector is not supported by this class.
 *
 * Always reports an error through SG_CLASS_ERROR.
 *
 * @param num index of the vector to clean (unused)
 */
template <class ST>
void CStringFileFeatures<ST>::cleanup_feature_vector(int32_t num)
{
	// The two adjacent literals are concatenated by the compiler, so the
	// first one must end in a space to keep "not supported" as two words.
	SG_CLASS_ERROR(CStringFeatures<ST>, "Cleaning single feature vector not "
		"supported by StringFileFeatures\n")
}
76
77
/**
 * Index the memory-mapped file: record start pointer and length of every
 * line returned by get_line() in the features table.
 *
 * The table starts with room for `granularity` entries and grows by
 * `granularity` entries whenever it is full; at the end it is shrunk to the
 * number of lines actually found. max_string_length and num_vectors are
 * recomputed from scratch.
 *
 * If the alphabet checks fail after scanning, the object is reset through
 * cleanup() and left empty.
 *
 * @param granularity initial size and growth step of the string table, in
 *                    entries; must be >= 1
 */
template <class ST>
void CStringFileFeatures<ST>::fetch_meta_info_from_file(int32_t granularity)
{
	// Check the preconditions before any state is touched: cleanup()
	// dereferences the alphabet and get_size() dereferences the file, so
	// asserting afterwards would come too late to be of any use.
	ASSERT(granularity>=1)
	ASSERT(file)
	ASSERT(CStringFeatures<ST>::alphabet)

	CStringFileFeatures<ST>::cleanup();
	uint64_t file_size=file->get_size();

	int64_t buffer_size=granularity;
	CStringFeatures<ST>::features=SG_MALLOC(SGString<ST>, buffer_size);

	uint64_t offs=0;
	uint64_t len=0;
	CStringFeatures<ST>::max_string_length=0;
	CStringFeatures<ST>::num_vectors=0;

	while (true)
	{
		// get_line() bumps num_vectors itself (it is passed as line_nr), so
		// after a successful call the new line lives at index num_vectors-1.
		ST* line=get_line(len, offs, CStringFeatures<ST>::num_vectors, file_size);

		if (!line)
			break;

		if (CStringFeatures<ST>::num_vectors > buffer_size)
		{
			CStringFeatures<ST>::features = SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, buffer_size, buffer_size+granularity);
			buffer_size+=granularity;
		}

		CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].string=line;
		CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].slen=len;
		CStringFeatures<ST>::max_string_length=CMath::max(CStringFeatures<ST>::max_string_length, (int32_t) len);
	}

	SG_CLASS_INFO(CStringFeatures<ST>, "number of strings:%d\n", CStringFeatures<ST>::num_vectors)
	SG_CLASS_INFO(CStringFeatures<ST>, "maximum string length:%d\n", CStringFeatures<ST>::max_string_length)
	SG_CLASS_INFO(CStringFeatures<ST>, "max_value_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_max_value_in_histogram())
	SG_CLASS_INFO(CStringFeatures<ST>, "num_symbols_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_num_symbols_in_histogram())

	if (!CStringFeatures<ST>::alphabet->check_alphabet_size() || !CStringFeatures<ST>::alphabet->check_alphabet())
	{
		// cleanup() has already freed the table and zeroed num_vectors;
		// there is nothing left to shrink, and buffer_size no longer
		// describes the (now NULL) table, so stop here.
		CStringFileFeatures<ST>::cleanup();
		return;
	}

	// Give back the unused tail of the table.
	CStringFeatures<ST>::features=SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, buffer_size, CStringFeatures<ST>::num_vectors);
}
122
123
template
class
CStringFileFeatures<bool>
;
124
template
class
CStringFileFeatures<char>
;
125
template
class
CStringFileFeatures<int8_t>
;
126
template
class
CStringFileFeatures<uint8_t>
;
127
template
class
CStringFileFeatures<int16_t>
;
128
template
class
CStringFileFeatures<uint16_t>
;
129
template
class
CStringFileFeatures<int32_t>
;
130
template
class
CStringFileFeatures<uint32_t>
;
131
template
class
CStringFileFeatures<int64_t>
;
132
template
class
CStringFileFeatures<uint64_t>
;
133
template
class
CStringFileFeatures<float32_t>
;
134
template
class
CStringFileFeatures<float64_t>
;
135
template
class
CStringFileFeatures<floatmax_t>
;
136
}
SHOGUN
机器学习工具包 - 项目文档