bucket sort原理(字串版):
目標是把英文單詞依照第一個英文字的字母順序排列。bucket是籃子,因為ascii字符集只會有8個bit,所以要準備2的8次方個籃子,每個籃子會有一個值(int buckets[]
),代表這個籃子所代表的字,它在output的第幾個開始。
舉個例子吧。假設現在有這些字串要排序:
"foo", "boo", "bar", "qoo", "qar", "baz", "qux", "qaz"
那麼經過一連串的運算後,我應該要得到buckets['q'] == 4,代表排序完以後q開頭的單詞會從鎮列第四個位置開始排。
bucket sort的pseudo code如下:
- 先算出buckets陣列的每一個值,也就是每一個字的開頭位置(void compute_buckets)
- 掃描input,根據buckets[某字]的值對照該input應該擺哪裡,每擺一個就對buckets[某字]+1。
(void sort_strings)
完整程式碼如下:
#include <stdio.h>
void compute_buckets(int n, char *array[n], int buckets[256])
{
for (int i = 0; i < 256; i++) {
buckets[i] = 0;
}
for (int i = 0; i < n; i++) {
unsigned char bucket = (unsigned char)array[i][0];
buckets[bucket]++;
}
int m = n;
for (int i = 256 - 1; i >= 0; i--) {
int count = buckets[i];
buckets[i] = m - count;
m -= count;
}
}
void sort_strings(int n, char *input[n], char *output[n])
{
int buckets[256];
compute_buckets(n, input, buckets);
for (int i = 0; i < n; i++) {
unsigned char bucket = (unsigned char)input[i][0];
int index = buckets[bucket]++;
output[index] = input[i];
}
}
int main(void)
{
char *array[] = {
"foo", "boo", "bar", "qoo", "qar", "baz", "qux", "qaz"
};
int n = sizeof array / sizeof *array;
char *output[n];
sort_strings(n, array, output);
for (int i = 0; i < n; i++) {
printf("%s\n", output[i]);
}
return 0;
}
接下來就可以開始講radix sort,這裡以3位數的排序來舉例。
假設需要排序的數列是: 7,121,8,35,16,44,那麼我們需要10個桶子代表0-9,對個位數、十位數、百位數進行3次的bucket sort。
第一回合如下(個位數):
輸出會是121,44,35,16,7,8(輸出方式是由左到右,由上而下),再拿這個output來進行第二回合:
這樣輸出會是7,8,16,121,35,44
再拿這個output當作input來進行第三回合,可以發現除了121以外的數字都照input順序由上到下排在0的桶子裡面,照由左到右,由上而下的輸出即可完成排序。
問題來了,要怎麼把前一回合輸出的陣列當作下一回合的輸入? 答案是下面這個函式:
void radix_sort(int n, int array[n])
{
// It is *very* unlikely that sizeof an integer is odd, but if
// it is, you need to move the results from helper
// to array. I assume that we have an even number of bytes
// because that is practically always true for int
static_assert(sizeof *array % 2 == 0,
"integer sizes must be powers of two");
// Helper buffer; handle input/output switches
// when bucket sorting
int helper[n];
// For switching between the buffers
int *buffers[] = { array, helper };
int bucket_input = 0;
for (int offset = 0; offset < sizeof *array; offset++) {
bucket_sort(n, offset,
buffers[bucket_input],
buffers[!bucket_input]);
bucket_input = !bucket_input;
}
}
重要!
這裡多出了一個helper陣列,更有趣的是互換機制,是立了一個bucket_input
當作flag,每一回合要嘛0要嘛1,對應int *buffers[] = { array, helper };
的第0個跟第1個,來當作陣input、output切換。
以範例程式來說,要排的是int,我們可以不用個十百千萬這樣照十進位位數做bucket sort,因為int一定是4個byte,所以每次比一個byte比四次,即可得出正確答案。而1個byte有8個bit,所以需要256大小的buckets陣列。
但是負數的問題要處理,處理方式是把數列分成正數塊跟負數塊,再分別比較。具體做法,是排序前要先整理數列,把負數集中在左邊,正數集中在右邊。做法是兩個函式scan_right
跟scan_left
分別回傳最右邊的負數位置跟最左邊的正數位置,再利用swap函數做交換。如此持續直到scan_right
的回傳值大於scan_left
為止。
int *scan_right(int *left, int *right)
{
while (left < right) {
if (*left >= 0) break;
left++;
}
return left;
}
// Both left and right must point to legal addresses
int *scan_left(int *left, int *right)
{
while (left < right) {
if (*right < 0) break;
right--;
}
return right;
}
// Both left and right must point to legal addresses
void swap(int *left, int *right)
{
int i = *left;
*left = *right;
*right = i;
}
int split(int n, int array[n])
{
int *left = array, *right = array + n - 1;
while (left < right) {
left = scan_right(left, right);
right = scan_left(left, right);
swap(left, right);
}
return left - array;
}
而正負數混合數列的排序,其程式碼如下:
void sort_int(int n, int array[n])
{
if (n <= 0) return;
int m = split(n, array);
if (m > 0) {
radix_sort(m, array);
reverse(m, array);
}
if (m < n) {
radix_sort(n - m, array + m);
}
}
完整程式碼:
#include <stdio.h>
#include <assert.h>
void bucket_sort(int n, int offset,
int const input[n], int output[n])
{
int buckets[256];
for (int i = 0; i < 256; i++) {
buckets[i] = 0;
}
for (int i = 0; i < n; i++) {
unsigned char bucket = (input[i] >> 8 * offset) & 0xff;
buckets[bucket]++;
}
int m = n;
for (int i = 256 - 1; i >= 0; i--) {
int count = buckets[i];
buckets[i] = m - count;
m -= count;
}
for (int i = 0; i < n; i++) {
unsigned char bucket = (input[i] >> 8 * offset) & 0xff;
int index = buckets[bucket]++;
output[index] = input[i];
}
}
void radix_sort(int n, int array[n])
{
// It is *very* unlikely that sizeof an integer is odd, but if
// it is, you need to move the results from helper
// to array. I assume that we have an even number of bytes
// because that is practically always true for int
static_assert(sizeof *array % 2 == 0,
"integer sizes must be powers of two");
// Helper buffer; handle input/output switches
// when bucket sorting
int helper[n];
// For switching between the buffers
int *buffers[] = { array, helper };
int bucket_input = 0;
for (int offset = 0; offset < sizeof *array; offset++) {
bucket_sort(n, offset,
buffers[bucket_input],
buffers[!bucket_input]);
bucket_input = !bucket_input;
}
}
// Both left and right must point to legal addresses
int *scan_right(int *left, int *right)
{
while (left < right) {
if (*left >= 0) break;
left++;
}
return left;
}
// Both left and right must point to legal addresses
int *scan_left(int *left, int *right)
{
while (left < right) {
if (*right < 0) break;
right--;
}
return right;
}
// Both left and right must point to legal addresses
void swap(int *left, int *right)
{
int i = *left;
*left = *right;
*right = i;
}
int split(int n, int array[n])
{
int *left = array, *right = array + n - 1;
while (left < right) {
left = scan_right(left, right);
right = scan_left(left, right);
swap(left, right);
}
return left - array;
}
void reverse(int n, int array[n])
{
int *left = array, *right = array + n - 1;
while (left < right) {
swap(left++, right--);
}
}
void sort_int(int n, int array[n])
{
if (n <= 0) return;
int m = split(n, array);
if (m > 0) {
radix_sort(m, array);
reverse(m, array);
}
if (m < n) {
radix_sort(n - m, array + m);
}
}
int main(void)
{
int array[] = { -1, -2, 13, 12, 4, 4200, 13, 6, 14, -3, 42, 13 };
int n = sizeof array / sizeof *array;
radix_sort(n, array);
for (int i = 0; i < n; i++) {
printf("%d ", array[i]);
}
printf("\n");
sort_int(n, array);
for (int i = 0; i < n; i++) {
printf("%d ", array[i]);
}
printf("\n");
return 0;
}