本节介绍PostgreSQL在指定排序规则(collate)时执行排序的实现逻辑.
在指定collate为zh_CN时,排序结果与指定collate为C时并不一致(对比下面两条查询的输出):
[local]:5432 pg12@testdb=# SELECT name FROM unnest(ARRAY['MYNAME', 'my-image.jpg', 'my-third-image.jpg']) name ORDER BY name collate "C";
name
--------------------
MYNAME
my-image.jpg
my-third-image.jpg
(3 rows)
Time: 78.843 ms
[local]:5432 pg12@testdb=# SELECT name FROM unnest(ARRAY['MYNAME', 'my-image.jpg', 'my-third-image.jpg']) name ORDER BY name collate "zh_CN";
name
--------------------
my-image.jpg
MYNAME
my-third-image.jpg
(3 rows)
Time: 70.125 ms
一、数据结构
VarStringSortSupport
变长字符串排序支持
/*
 * Per-sort scratch state used by the variable-length string comparators.
 * The comparator shown in this file (varstrfastcmp_locale) reads it through
 * ssup->ssup_extra.
 */
typedef struct
{
char *buf1; /* NUL-terminated copy of the most recent first argument */
char *buf2; /* NUL-terminated copy of the most recent second argument */
int buflen1; /* allocated size of buf1, grown on demand */
int buflen2; /* allocated size of buf2, grown on demand */
int last_len1; /* length of the string currently held in buf1 */
int last_len2; /* length of the string currently held in buf2 */
int last_returned; /* result of the last real comparison, reused on a cache hit */
bool cache_blob; /* while true the cached result is not reused; cleared after each real comparison */
bool collate_c; /* NOTE(review): not referenced by the comparator shown here; presumably set for the C collation */
Oid typid; /* datatype OID; compared against BPCHAROID to decide on length trimming */
hyperLogLogState abbr_card; /* NOTE(review): not used in the code shown; presumably abbreviated-key cardinality state */
hyperLogLogState full_card; /* NOTE(review): not used in the code shown; presumably full-key cardinality state */
double prop_card; /* NOTE(review): not used in the code shown; meaning to be confirmed */
pg_locale_t locale; /* NULL selects plain strcoll(); otherwise chooses the ICU or libc path */
} VarStringSortSupport;
pg_locale_t
PG自定义的locale包装器
/*
 * NOTE(review): this alias appears unguarded in the excerpt; it presumably
 * belongs inside a platform-specific #ifdef (a platform whose locale type is
 * named _locale_t) -- confirm against the original header.
 */
#define locale_t _locale_t
/*
 * Wrapper describing one collation/locale.  Exactly one member of the
 * "info" union is meaningful, selected by "provider".
 */
struct pg_locale_struct
{
char provider; /* compared against COLLPROVIDER_ICU by the comparator */
bool deterministic; /* when true, comparisons that return 0 are tie-broken with strcmp() */
union
{
#ifdef HAVE_LOCALE_T
locale_t lt; /* libc locale handle, passed to strcoll_l() */
#endif
#ifdef USE_ICU
struct
{
const char *locale; /* NOTE(review): presumably the ICU locale name; not used in the code shown */
UCollator *ucol; /* ICU collator passed to ucol_strcoll()/ucol_strcollUTF8() */
} icu;
#endif
int dummy; /* keeps the union non-empty when neither HAVE_LOCALE_T nor USE_ICU is defined */
} info;
};
/* Pointer type so that callers can test "no locale" with a simple NULL check. */
typedef struct pg_locale_struct *pg_locale_t;
二、源码解读
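varstrfastcmp_locale函数用于locale定制化排序实现,主要的实现函数是strcoll_l,该函数是C库(glibc)函数.如collate设置为zh_CN,则使用拼音进行排序;对于英文字符,strcoll_l按多轮(pass)权重进行比较,首轮比较时不区分大小写并忽略'-'等标点,只有在前面的轮次比较相等时才会在后续轮次中区分,因此上例中my-image.jpg排在MYNAME之前.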
/*
 * varstrfastcmp_locale
 *
 * Locale-aware comparison of two counted (not NUL-terminated) strings for
 * sort support.  The inputs are copied into the NUL-terminated scratch
 * buffers kept in the VarStringSortSupport state so they can be handed to
 * strcoll()/strcoll_l(); the ICU path works on the original pointers and
 * lengths instead.
 *
 * a1p/len1, a2p/len2: the two strings and their byte lengths.
 * ssup: sort support state; ssup->ssup_extra is the VarStringSortSupport and
 *       ssup->ssup_cxt is the memory context used to regrow the buffers.
 *
 * Returns a value <0, 0 or >0.  The result of every real comparison is
 * remembered so that an immediately repeated comparison of the same pair can
 * be answered from the cache.
 */
static int
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
{
    VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    int         cmp;
    bool        arg1_cached;

    /* Byte-identical inputs compare equal without consulting the locale. */
    if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
        return 0;

    /*
     * For bpchar, replace the lengths with what bpchartruelen() reports
     * (presumably the length without trailing pad spaces).
     */
    if (sss->typid == BPCHAROID)
    {
        len1 = bpchartruelen(a1p, len1);
        len2 = bpchartruelen(a2p, len2);
    }

    /* Grow scratch buffer 1 if it cannot hold len1 bytes plus the NUL. */
    if (len1 >= sss->buflen1)
    {
        pfree(sss->buf1);
        sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
        sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
    }

    /* Same for scratch buffer 2. */
    if (len2 >= sss->buflen2)
    {
        pfree(sss->buf2);
        sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
        sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
    }

    /* Copy the first argument into buf1 unless buf1 already holds it. */
    arg1_cached = (len1 == sss->last_len1 &&
                   memcmp(sss->buf1, a1p, len1) == 0);
    if (!arg1_cached)
    {
        memcpy(sss->buf1, a1p, len1);
        sss->buf1[len1] = '\0';
        sss->last_len1 = len1;
    }

    /* Copy the second argument into buf2 unless buf2 already holds it. */
    if (len2 == sss->last_len2 && memcmp(sss->buf2, a2p, len2) == 0)
    {
        /* Both buffers unchanged: reuse the previous result when allowed. */
        if (arg1_cached && !sss->cache_blob)
            return sss->last_returned;
    }
    else
    {
        memcpy(sss->buf2, a2p, len2);
        sss->buf2[len2] = '\0';
        sss->last_len2 = len2;
    }

    if (!sss->locale)
    {
        /* No locale object: use the C library's strcoll(). */
        cmp = strcoll(sss->buf1, sss->buf2);
    }
    else if (sss->locale->provider == COLLPROVIDER_ICU)
    {
        /* A locale object is set and it is provided by ICU. */
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
        if (GetDatabaseEncoding() == PG_UTF8)
        {
            UErrorCode  status = U_ZERO_ERROR;

            cmp = ucol_strcollUTF8(sss->locale->info.icu.ucol,
                                   a1p, len1,
                                   a2p, len2,
                                   &status);
            if (U_FAILURE(status))
                ereport(ERROR,
                        (errmsg("collation failed: %s", u_errorName(status))));
        }
        else
#endif
        {
            int32_t     ulen1;
            int32_t     ulen2;
            UChar      *uchar1;
            UChar      *uchar2;

            /* Convert both inputs to UChar strings, compare, then free. */
            ulen1 = icu_to_uchar(&uchar1, a1p, len1);
            ulen2 = icu_to_uchar(&uchar2, a2p, len2);

            cmp = ucol_strcoll(sss->locale->info.icu.ucol,
                               uchar1, ulen1,
                               uchar2, ulen2);

            pfree(uchar1);
            pfree(uchar2);
        }
#else
        elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif
    }
    else
    {
#ifdef HAVE_LOCALE_T
        /* libc-provided locale: call the library function strcoll_l(). */
        cmp = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
#else
        elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif
    }

    /*
     * The strings are known not to be byte-identical, so for a deterministic
     * collation (or no locale object) break a collation tie bytewise.
     */
    if (cmp == 0 && (!sss->locale || sss->locale->deterministic))
        cmp = strcmp(sss->buf1, sss->buf2);

    /* Remember the outcome for a possible repeat of the same comparison. */
    sss->cache_blob = false;
    sss->last_returned = cmp;

    return cmp;
}
strcoll_l.c
#include <assert.h>
#include <langinfo.h>
#include <locale.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <sys/param.h>
#include <libc-diag.h>
/*
 * Default configuration: the narrow (char) flavour of the collation routine.
 * A translation unit that defines STRING_TYPE before reaching this point
 * supplies its own set of these macros instead (presumably the
 * wide-character build -- confirm).
 */
#ifndef STRING_TYPE
# define STRING_TYPE char
# define USTRING_TYPE unsigned char
# define STRCOLL __strcoll_l
# define STRCMP strcmp
# define WEIGHT_H "../locale/weight.h"
# define SUFFIX MB
# define L(arg) arg
#endif
/*
 * Two-level token pasting so that macro arguments such as SUFFIX are
 * expanded before being joined (used to build the _NL_COLLATE_* item names).
 */
#define CONCAT(a,b) CONCAT1(a,b)
#define CONCAT1(a,b) a##b
#include "../locale/localeinfo.h"
#include WEIGHT_H
/* Scanning state for one of the two strings being compared. */
typedef struct
{
int len; /* number of weights left in the current sequence; 0 means "fetch the next one" */
size_t val; /* count of sequences stepped over by the last get_next_seq() call; compared when `position' is set */
size_t idxmax; /* running count of sequences read from the string so far */
size_t idxcnt; /* index of the sequence currently being looked at */
size_t backw; /* index of the current sequence inside a backward run */
size_t backw_stop; /* index where the backward run stops; ~0ul when no backward run is pending */
const USTRING_TYPE *us; /* current read position in the string */
unsigned char rule; /* rule number recorded for the first sequence of the string */
int32_t idx; /* index into the weights array for the current sequence */
int32_t save_idx; /* weight index of the forward sequence read after a backward run, restored once the run is consumed */
const USTRING_TYPE *back_us; /* string position where the pending backward run begins */
} coll_seq;
/*
 * Advance SEQ to the next collation sequence that has at least one weight
 * for the given PASS.
 *
 * The scanning state is copied into locals, the loop below runs while the
 * current sequence length is zero, and the locals are written back to *SEQ
 * at the end.  On return seq->len is the number of weights available at
 * weights[seq->idx], or 0 if the loop was left via one of the `break's
 * (no further sequence).  seq->val is the number of loop iterations that
 * were needed, i.e. it grows with every sequence that had to be skipped.
 *
 * rulesets/nrules: per-rule, per-pass flag bytes (indexed rule * nrules + pass).
 * weights/table/extra/indirect: locale collation tables handed to findidx().
 */
static __always_inline void
get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
const USTRING_TYPE *weights, const int32_t *table,
const USTRING_TYPE *extra, const int32_t *indirect,
int pass)
{
/* seq->val is reset on every call; the local copy counts iterations. */
size_t val = seq->val = 0;
int len = seq->len;
size_t backw_stop = seq->backw_stop;
size_t backw = seq->backw;
size_t idxcnt = seq->idxcnt;
size_t idxmax = seq->idxmax;
int32_t idx = seq->idx;
const USTRING_TYPE *us = seq->us;
while (len == 0)
{
++val;
/* backw_stop != ~0ul: a run of backward-sorted sequences is pending. */
if (backw_stop != ~0ul)
{
if (backw == backw_stop)
{
/* The pending backward run has been fully consumed. */
if (idxcnt < idxmax)
{
/* Continue with the forward sequence saved after the run. */
idx = seq->save_idx;
backw_stop = ~0ul;
}
else
{
/* Nothing follows the backward run; leave with len still 0. */
idx = 0;
break;
}
}
else
{
/*
 * Re-scan the backward run from its start (back_us), calling
 * findidx() once for each index from backw_stop up to backw, and
 * keep the weight index of the last one; then step backw back.
 */
size_t i = backw_stop;
us = seq->back_us;
while (i < backw)
{
int32_t tmp = findidx (table, indirect, extra, &us, -1);
idx = tmp & 0xffffff;
i++;
}
--backw;
/* Restore the forward read position saved in seq->us. */
us = seq->us;
}
}
else
{
/* No backward run pending: read forward from the string. */
backw_stop = idxmax;
int32_t prev_idx = idx;
while (*us != L('\0'))
{
/* findidx() result: high 8 bits = rule number, low 24 bits = weight index. */
int32_t tmp = findidx (table, indirect, extra, &us, -1);
unsigned char rule = tmp >> 24;
prev_idx = idx;
idx = tmp & 0xffffff;
idxcnt = idxmax++;
/* Record the rule of the very first sequence of the string. */
if (__glibc_unlikely (idxcnt == 0))
seq->rule = rule;
/* Stop collecting as soon as a sequence is not sorted backward in this pass. */
if ((rulesets[rule * nrules + pass]
& sort_backward) == 0)
break;
++idxcnt;
}
if (backw_stop >= idxcnt)
{
/*
 * No backward sequence was collected.  Leave the loop with len
 * still 0 when nothing new was read (idxcnt == idxmax) or
 * backw_stop lies beyond idxcnt; otherwise process the single
 * forward sequence just read.
 */
if (idxcnt == idxmax || backw_stop > idxcnt)
break;
backw_stop = ~0ul;
}
else
{
/*
 * Backward sequences were collected.  Remember where the run
 * started (back_us) and where forward reading resumes (seq->us).
 * If a forward sequence was read after the run (idxmax > idxcnt),
 * park its weight index in save_idx and process the one before it.
 */
seq->back_us = seq->us;
seq->us = us;
backw = idxcnt;
if (idxmax > idxcnt)
{
backw--;
seq->save_idx = idx;
idx = prev_idx;
}
if (backw > backw_stop)
backw--;
}
}
/*
 * The diagnostic macros silence -Wmaybe-uninitialized for the read of
 * idx below.  idx originates from seq->idx, which STRCOLL sets to 0
 * before every pass.
 */
DIAG_PUSH_NEEDS_COMMENT;
DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
/* weights[idx] is the weight count for pass 0; the weights follow it. */
len = weights[idx++];
DIAG_POP_NEEDS_COMMENT;
/* Skip over the weight groups of the earlier passes. */
for (int i = 0; i < pass; i++)
{
idx += len;
len = weights[idx];
idx++;
}
}
/* Write the updated scanning state back. */
seq->val = val;
seq->len = len;
seq->backw_stop = backw_stop;
seq->backw = backw;
seq->idxcnt = idxcnt;
seq->idxmax = idxmax;
seq->us = us;
seq->idx = idx;
}
/*
 * Compare the current weight sequences of SEQ1 and SEQ2.
 *
 * When POSITION is non-zero and the two sequences were reached after a
 * different number of steps (seq->val), that difference alone decides the
 * result (+1 / -1).  Otherwise the weights are compared pairwise, starting
 * at weights[seq->idx], until a pair differs or either sequence runs out.
 * If no pair differs and POSITION is set, a difference in the remaining
 * lengths decides the result.
 *
 * Whatever the outcome, the remaining length and the weight index of both
 * sequences are stored back so the caller can continue from there.
 *
 * Returns 0 if no difference was found, otherwise a non-zero value whose
 * sign gives the ordering.
 */
static __always_inline int
do_compare (coll_seq *seq1, coll_seq *seq2, int position,
            const USTRING_TYPE *weights)
{
  int remaining1 = seq1->len;
  int remaining2 = seq2->len;
  int cursor1 = seq1->idx;
  int cursor2 = seq2->idx;
  size_t steps1 = seq1->val;
  size_t steps2 = seq2->val;
  int outcome = 0;

  if (position && steps1 != steps2)
    {
      /* Positions differ: decided without looking at any weight.  */
      outcome = steps1 > steps2 ? 1 : -1;
    }
  else
    {
      int mismatch = 0;

      /* Walk both weight sequences in lock-step.  */
      do
        {
          if (weights[cursor1] != weights[cursor2])
            {
              outcome = weights[cursor1] - weights[cursor2];
              mismatch = 1;
              break;
            }

          ++cursor1;
          ++cursor2;
          --remaining1;
          --remaining2;
        }
      while (remaining1 > 0 && remaining2 > 0);

      /* All compared weights matched; a length difference may still count.  */
      if (!mismatch && position && remaining1 != remaining2)
        outcome = remaining1 - remaining2;
    }

  /* Publish the progress made on both sequences.  */
  seq1->len = remaining1;
  seq2->len = remaining2;
  seq1->idx = cursor1;
  seq2->idx = cursor2;
  return outcome;
}
/*
 * Compare S1 and S2 according to the LC_COLLATE data of locale L.
 *
 * The comparison runs up to `nrules' passes over both strings.  In each pass
 * the strings are split into collation sequences (get_next_seq) whose
 * weights for that pass are compared (do_compare); the first pass that finds
 * a difference decides the result.
 *
 * Returns 0 if no pass finds a difference, otherwise a non-zero value whose
 * sign gives the ordering of S1 relative to S2.
 */
int
STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, locale_t l)
{
struct __locale_data *current = l->__locales[LC_COLLATE];
uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
/* The table pointers are only fetched after the early exits below. */
const unsigned char *rulesets;
const int32_t *table;
const USTRING_TYPE *weights;
const USTRING_TYPE *extra;
const int32_t *indirect;
/* A locale without collation rules falls back to plain STRCMP. */
if (nrules == 0)
return STRCMP (s1, s2);
/* Empty strings: 0 if both are empty, -1 if only s1 is, 1 if only s2 is. */
if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
return (*s1 != '\0') - (*s2 != '\0');
rulesets = (const unsigned char *)
current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
table = (const int32_t *)
current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
weights = (const USTRING_TYPE *)
current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
extra = (const USTRING_TYPE *)
current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
indirect = (const int32_t *)
current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
/* The tables are accessed as typed arrays, so they must be suitably aligned. */
assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
int result = 0, rule = 0;
/*
 * The diagnostic macros silence -Wmaybe-uninitialized for seq1/seq2: only
 * some of their fields are assigned before get_next_seq() is first called.
 */
DIAG_PUSH_NEEDS_COMMENT;
DIAG_IGNORE_Os_NEEDS_COMMENT (7, "-Wmaybe-uninitialized");
coll_seq seq1, seq2;
DIAG_POP_NEEDS_COMMENT;
/* These fields are initialised once, not per pass. */
seq1.len = 0;
seq1.idxmax = 0;
seq1.rule = 0;
seq2.len = 0;
seq2.idxmax = 0;
for (int pass = 0; pass < nrules; ++pass)
{
/* Restart both scanners at the beginning of their strings for this pass. */
seq1.idxcnt = 0;
seq1.idx = 0;
seq2.idx = 0;
seq1.backw_stop = ~0ul;
seq1.backw = ~0ul;
seq2.idxcnt = 0;
seq2.backw_stop = ~0ul;
seq2.backw = ~0ul;
/* The string elements are read as unsigned values. */
seq1.us = (const USTRING_TYPE *) s1;
seq2.us = (const USTRING_TYPE *) s2;
/* Whether this pass also takes the position of sequences into account. */
int position = rulesets[rule * nrules + pass] & sort_position;
while (1)
{
get_next_seq (&seq1, nrules, rulesets, weights, table,
extra, indirect, pass);
get_next_seq (&seq2, nrules, rulesets, weights, table,
extra, indirect, pass);
/* len == 0 after get_next_seq() means that string has no further sequence. */
if (seq1.len == 0 || seq2.len == 0)
{
if (seq1.len == seq2.len)
{
/*
 * Both strings ended together, so they are equal at this level.
 * In the first pass, byte-identical strings return immediately
 * instead of going through the remaining passes.
 */
if (pass == 0 && STRCMP (s1, s2) == 0)
return result;
else
break;
}
/* Only one string ended: the one that ended first sorts first. */
return seq1.len == 0 ? -1 : 1;
}
result = do_compare (&seq1, &seq2, position, weights);
if (result != 0)
return result;
}
/* The next pass looks up its flags using the rule recorded for s1's first sequence. */
rule = seq1.rule;
}
return result;
}
libc_hidden_def (STRCOLL)
/*
 * NOTE(review): presumably publishes __strcoll_l under the public name
 * strcoll_l for the non-wide build -- confirm against the glibc macros.
 */
#ifndef WIDE_CHAR_VERSION
weak_alias (__strcoll_l, strcoll_l)
#endif
三、跟踪分析
查询SQL
[local]:5432 pg12@testdb=# SELECT name FROM unnest(ARRAY['MYNAME', 'my-image.jpg', 'my-third-image.jpg']) name ORDER BY name collate "zh_CN";
name
--------------------
my-image.jpg
MYNAME
my-third-image.jpg
(3 rows)
Time: 70.125 ms
gdb跟踪
(gdb) b varstrfastcmp_locale
Breakpoint 1 at 0xa30df7: file varlena.c, line 2244.
(gdb) c
Continuing.
Breakpoint 1, varstrfastcmp_locale (a1p=0x17bbe69 "MYNAME\017MYNAME~\177@", len1=6,
a2p=0x17bbea1 "my-image.jpg\033my-image.jpg~", '\177' <repeats 21 times>, "@", len2=12, ssup=0x17ba260)
at varlena.c:2244
2244 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
(gdb) n
2249 if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
(gdb) p *sss
$1 = {buf1 = 0x17ba3d0 '\177' <repeats 200 times>..., buf2 = 0x17ba7e8 '\177' <repeats 200 times>..., buflen1 = 1024,
buflen2 = 1024, last_len1 = -1, last_len2 = -1, last_returned = 0, cache_blob = true, collate_c = false, typid = 25,
abbr_card = {registerWidth = 127 '\177', nRegisters = 9187201950435737471, alphaMM = 1.3824172084878715e+306,
hashesArr = 0x7f7f7f7f7f7f7f7f <Address 0x7f7f7f7f7f7f7f7f out of bounds>, arrSize = 9187201950435737471}, full_card = {
registerWidth = 127 '\177', nRegisters = 9187201950435737471, alphaMM = 1.3824172084878715e+306,
hashesArr = 0x7f7f7f7f7f7f7f7f <Address 0x7f7f7f7f7f7f7f7f out of bounds>, arrSize = 9187201950435737471},
prop_card = 1.3824172084878715e+306, locale = 0x1717d90}
(gdb) n
2267 if (sss->typid == BPCHAROID)
(gdb)
2274 if (len1 >= sss->buflen1)
(gdb)
2280 if (len2 >= sss->buflen2)
(gdb)
2297 arg1_match = true;
(gdb)
2298 if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
(gdb)
2300 arg1_match = false;
(gdb)
2301 memcpy(sss->buf1, a1p, len1);
(gdb)
2302 sss->buf1[len1] = '\0';
(gdb)
2303 sss->last_len1 = len1;
(gdb)
2312 if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
(gdb)
2314 memcpy(sss->buf2, a2p, len2);
(gdb)
2315 sss->buf2[len2] = '\0';
(gdb)
2316 sss->last_len2 = len2;
(gdb)
2324 if (sss->locale)
(gdb)
2326 if (sss->locale->provider == COLLPROVIDER_ICU)
(gdb) p sss->locale
$2 = (pg_locale_t) 0x1717d90
(gdb) p *sss->locale
$3 = {provider = 99 'c', deterministic = true, info = {lt = 0x1707220, dummy = 24146464}}
(gdb) n
2369 result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
(gdb)
2380 if (result == 0 &&
(gdb) p result
$4 = 5
(gdb) n
2385 sss->cache_blob = false;
(gdb)
2386 sss->last_returned = result;
(gdb)
2387 return result;
(gdb) p result
$5 = 5
(gdb) p *sss->locale->info.lt
$6 = {__locales = {0x1706ce0, 0x7f115a2cc260 <_nl_C_LC_NUMERIC>, 0x7f115a2cc2e0 <_nl_C_LC_TIME>, 0x17a78b0,
0x7f115a2cc0a0 <_nl_C_LC_MONETARY>, 0x7f115a2cc020 <_nl_C_LC_MESSAGES>, 0x0, 0x7f115a2cc6a0 <_nl_C_LC_PAPER>,
0x7f115a2cc700 <_nl_C_LC_NAME>, 0x7f115a2cc780 <_nl_C_LC_ADDRESS>, 0x7f115a2cc840 <_nl_C_LC_TELEPHONE>,
0x7f115a2cc8c0 <_nl_C_LC_MEASUREMENT>, 0x7f115a2cc920 <_nl_C_LC_IDENTIFICATION>}, __ctype_b = 0x7f115410e0e0,
__ctype_tolower = 0x7f115410eae0, __ctype_toupper = 0x7f115410e4e0, __names = {0x1707308 "zh_CN.utf8",
0x7f115a090fd5 <_nl_C_name> "C", 0x7f115a090fd5 <_nl_C_name> "C", 0x1707313 "zh_CN.utf8",
0x7f115a090fd5 <_nl_C_name> "C", 0x7f115a090fd5 <_nl_C_name> "C", 0x7f115a090fd5 <_nl_C_name> "C",
0x7f115a090fd5 <_nl_C_name> "C", 0x7f115a090fd5 <_nl_C_name> "C", 0x7f115a090fd5 <_nl_C_name> "C",
0x7f115a090fd5 <_nl_C_name> "C", 0x7f115a090fd5 <_nl_C_name> "C", 0x7f115a090fd5 <_nl_C_name> "C"}}
(gdb)
四、参考资料
N/A
免责声明:
① 本站未注明“稿件来源”的信息均来自网络整理。其文字、图片和音视频稿件的所属权归原作者所有。本站收集整理出于非商业性的教育和科研之目的,并不意味着本站赞同其观点或证实其内容的真实性。仅作为临时的测试数据,供内部测试之用。本站并未授权任何人以任何方式主动获取本站任何信息。
② 本站未注明“稿件来源”的临时测试数据将在测试完成后最终做删除处理。有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341
软考中级精品资料免费领
- 历年真题答案解析
- 备考技巧名师总结
- 高频考点精准押题
- 资料下载
- 历年真题
193.9 KB下载数265
191.63 KB下载数245
143.91 KB下载数1148
183.71 KB下载数642
644.84 KB下载数2756
相关文章
发现更多好内容猜你喜欢
AI推送时光机PostgreSQL 源码解读(202)- 查询#115(类型转换)
数据库2024-04-02
PostgreSQL 源码解读(17)- 查询语句#2(查询优化基础)
数据库2024-04-02
PostgreSQL 源码解读(42)- 查询语句#27(等价类)
数据库2024-04-02
咦!没有更多了?去看看其它编程学习网 内容吧