OceanBase Plugin Development Kit
OceanBase Plugin Development Kit
载入中...
搜索中...
未找到
space_ftparser.cpp
浏览该文件的文档.
1/*
2 * Copyright (c) 2025 OceanBase.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#include <new>
17
19
26namespace oceanbase {
27namespace example {
28
29class ObSpaceFTParser final
30{
31public:
32 ObSpaceFTParser() = default;
33 virtual ~ObSpaceFTParser();
34
35 int init(ObPluginDatum param);
36 void reset();
38 const char *&word,
39 int64_t &word_len,
40 int64_t &char_len,
41 int64_t &word_freq);
42
43private:
44 ObPluginDatum cs_ = 0;
45 const char * start_ = nullptr;
46 const char * next_ = nullptr;
47 const char * end_ = nullptr;
48 bool is_inited_ = false;
49};
50
// Character-class bits tested against the `ctype` value that obp_charset_ctype() reports.
// NOTE(review): names and values look like the MySQL ctype flag set; the per-bit
// meanings below are presumed from that convention -- confirm against the charset API.
#define _MY_U   01    // presumably: upper-case letter
#define _MY_L   02    // presumably: lower-case letter
#define _MY_NMR 04    // presumably: numeral (digit)
#define _MY_SPC 010   // presumably: spacing character
#define _MY_PNT 020   // presumably: punctuation
#define _MY_CTR 040   // presumably: control character
#define _MY_B   0100  // presumably: blank
#define _MY_X   0200  // presumably: hexadecimal digit

// True when `ctype` carries the _MY_U, _MY_L or _MY_NMR bit, or when `character`
// is an underscore. Both operands are fully parenthesized so the macro is safe
// to use with arbitrary argument expressions.
#define true_word_char(ctype, character) \
  ((((ctype) & (_MY_U | _MY_L | _MY_NMR)) != 0) || ((character) == '_'))
61
66
68{
69 cs_ = 0;
70 start_ = nullptr;
71 next_ = nullptr;
72 end_ = nullptr;
73 is_inited_ = false;
74}
75
77{
78 int ret = OBP_SUCCESS;
79 const char *fulltext = obp_ftparser_fulltext(param);
80 int64_t ft_length = obp_ftparser_fulltext_length(param);
82
83 if (is_inited_) {
84 ret = OBP_INIT_TWICE;
85 OBP_LOG_WARN("init twice. ret=%d, param=%p, this=%p", ret, param, this);
86 } else if (0 == param
87 || 0 == cs
88 || nullptr == fulltext
89 || 0 >= ft_length) {
91 OBP_LOG_WARN("invalid arguments, ret=%d, param=%p", ret, param);
92 } else {
93 cs_ = cs;
94 start_ = fulltext;
95 next_ = start_;
96 end_ = start_ + ft_length;
97 is_inited_ = true;
98 }
99 if (ret != OBP_SUCCESS && !is_inited_) {
100 reset();
101 }
102 OBP_LOG_INFO("ftparser init done. ret=%d", ret);
103 return ret;
104}
105
107 const char *&word,
108 int64_t &word_len,
109 int64_t &char_len,
110 int64_t &word_freq)
111{
112 int ret = OBP_SUCCESS;
113 int mbl = 0;
114 word = nullptr;
115 word_len = 0;
116 char_len = 0;
117 word_freq = 0;
118 if (!is_inited_) {
119 ret = OBP_PLUGIN_ERROR;
120 OBP_LOG_WARN("space ft parser isn't initialized. ret=%d, is_inited=%d", ret, is_inited_);
121 } else {
122 const char *start = start_;
123 const char *next = next_;
124 const char *end = end_;
125 const ObPluginCharsetInfoPtr cs = cs_;
126 do {
127 while (next < end) {
128 int ctype;
129 mbl = obp_charset_ctype(cs, &ctype, (unsigned char *)next, (unsigned char *)end);
130 if (true_word_char(ctype, *next)) {
131 break;
132 }
133 next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
134 }
135 if (next >= end) {
136 ret = OBP_ITER_END;
137 } else {
138 int64_t c_nums = 0;
139 start = next;
140 while (next < end) {
141 int ctype;
142 mbl = obp_charset_ctype(cs, &ctype, (unsigned char *)next, (unsigned char *)end);
143 if (!true_word_char(ctype, *next)) {
144 break;
145 }
146 ++c_nums;
147 next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
148 }
149 if (0 < c_nums) {
150 word = start;
151 word_len = next - start;
152 char_len = c_nums;
153 word_freq = 1;
154 start = next;
155 break;
156 } else {
157 start = next;
158 }
159 }
160 } while (ret == OBP_SUCCESS && next < end);
161 if (OBP_ITER_END == ret || OBP_SUCCESS == ret) {
162 start_ = start;
163 next_ = next;
164 end_ = end;
165 }
166 OBP_LOG_TRACE("next word. start=%p, next=%p, end=%p", start_, next_, end_);
167 }
168 return ret;
169}
170
171} // namespace example
172} // namespace oceanbase
173
174using namespace oceanbase::example;
175
177{
178 int ret = OBP_SUCCESS;
179 ObSpaceFTParser *parser = new (std::nothrow) ObSpaceFTParser;
180 ret = parser->init(param);
181 if (OBP_SUCCESS != ret) {
182 delete parser;
183 return ret;
184 }
185 obp_ftparser_set_user_data(param, (parser));
186 return OBP_SUCCESS;
187}
188
190{
192 delete parser;
194 return OBP_SUCCESS;
195}
196
198 char **word,
199 int64_t *word_len,
200 int64_t *char_cnt,
201 int64_t *word_freq)
202{
203 int ret = OBP_SUCCESS;
204 if (word == nullptr || word_len == nullptr || char_cnt == nullptr || word_freq == nullptr) {
206 } else {
208 ret = parser->get_next_token((const char *&)(*word), *word_len, *char_cnt, *word_freq);
209 }
210 return ret;
211}
212
213int ftparser_get_add_word_flag(uint64_t *flag)
214{
215 int ret = OBP_SUCCESS;
216 if (flag == nullptr) {
218 } else {
223 }
224 return ret;
225}
226
235{
236 int ret = OBP_SUCCESS;
238 ObPluginFTParser parser = {
239 .init = NULL,
240 .deinit = NULL,
241 .scan_begin = ftparser_scan_begin,
242 .scan_end = ftparser_scan_end,
243 .next_token = ftparser_next_token,
244 .get_add_word_flag = ftparser_get_add_word_flag
245 };
246
248 ret = OBP_REGISTER_FTPARSER(plugin,
249 "example_ftparser",
250 parser,
251 "This is an example ftparser.");
252 return ret;
253}
254
255OBP_DECLARE_PLUGIN(example_ftparser)
256{
257 OBP_AUTHOR_OCEANBASE, // author
258 OBP_MAKE_VERSION(1, 0, 0), // version of this plugin library
259 OBP_LICENSE_MULAN_PSL_V2, // license of this plugin
260 plugin_init, // init // plugin initialization function; plugin_init registers each plugin feature
261 nullptr, // deinit // plugin destructor function
263
int get_next_token(const char *&word, int64_t &word_len, int64_t &char_len, int64_t &word_freq)
int ftparser_scan_end(ObPluginFTParserParamPtr param)
int ftparser_next_token(ObPluginFTParserParamPtr param, char **word, int64_t *word_len, int64_t *char_cnt, int64_t *word_freq)
int plugin_init(ObPluginParamPtr plugin)
plugin init function
int ftparser_get_add_word_flag(uint64_t *flag)
int ftparser_scan_begin(ObPluginFTParserParamPtr param)
OBP_PUBLIC_API ObPluginCharsetInfoPtr obp_ftparser_charset_info(ObPluginFTParserParamPtr param)
OBP_PUBLIC_API void obp_ftparser_set_user_data(ObPluginFTParserParamPtr param, ObPluginDatum user_data)
set user data
OBP_PUBLIC_API ObPluginDatum obp_ftparser_user_data(ObPluginFTParserParamPtr param)
The user data of fulltext parameter
#define OBP_REGISTER_FTPARSER(param, name, descriptor, description)
OBP_PUBLIC_API const char * obp_ftparser_fulltext(ObPluginFTParserParamPtr param)
The fulltext is the text that you should split into tokens. @NOTE the fulltext is not terminated by '\0'.
OBP_PUBLIC_API int64_t obp_ftparser_fulltext_length(ObPluginFTParserParamPtr param)
get the charsetinfo object from param
ObPluginDatum ObPluginFTParserParamPtr
full text parser add word flag
@ OBP_FTPARSER_AWF_STOPWORD
convert characters from uppercase to lowercase.
@ OBP_FTPARSER_AWF_GROUPBY_WORD
@ OBP_FTPARSER_AWF_MIN_MAX_WORD
filter by stop word table.
@ OBP_FTPARSER_AWF_CASEDOWN
distinct and word aggregation
#define OBP_LOG_INFO(fmt, args...)
#define OBP_LOG_WARN(fmt, args...)
const int OBP_SUCCESS
this is the adaptor errno of oceanbase errno
OBP_PUBLIC_API int obp_charset_ctype(ObPluginCharsetInfoPtr cs, int *ctype, const unsigned char *str, const unsigned char *end)
Get the ctype of the char
const int OBP_INVALID_ARGUMENT
#define OBP_LOG_TRACE(fmt, args...)
const int OBP_ITER_END
const int OBP_INIT_TWICE
ObPluginDatum ObPluginCharsetInfoPtr
const int OBP_PLUGIN_ERROR
#define OBP_AUTHOR_OCEANBASE
#define OBP_MAKE_VERSION(major, minor, patch)
#define OBP_LICENSE_MULAN_PSL_V2
ObPluginDatum ObPluginParamPtr
@NOTE all API should be declared as C interface
#define OBP_DECLARE_PLUGIN(name)
this is used to define a plugin
void * ObPluginDatum
Used for param type
#define true_word_char(ctype, character)
fulltext parser descriptor interface for domain index splitting a document into many tokenizations....