0.02秒找到最优解的华容道程序

擒揭发表于 2025-10-27 14:00:07

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <immintrin.h>
#include <xmmintrin.h>
#include <set>
#include "cwisstable.h"

const char* NM[] = { {"曹","贼","真","爽"}, {"西","施"}, {"昭","君"}, {"貂","蝉"}, {"甄","姬"}, {"玉","环"}, {"美"}, {"美"}, {"美"}, {"美"} };
int W[] = { 2, 2, 2, 2, 2, 2, 1, 1, 1, 1 }; // 默认5个水平条，随后修改
int H[] = { 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
int T; // Type
const int D[] = { {0, 1}, {0, -1}, {-1, 0}, {1, 0} }; // 后面有两处用下标判断移动方向

enum { MAX = 3600 * 10000 };
struct State {
uint8_t aa;
#define CCY aa // 曹操的y
int p; // 局面路径的previous
#define cpy20(dst, src) _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src)); *(int*)((uint8_t*)dst+16) = *(int*)((const uint8_t*)src+16)
void operator= (const State& s) { cpy20(aa, s.aa); p = s.p; }
void operator=(const char* s);
void print(const char* s = "");
} q; // 最多40个move. MAX很大; 没判断qt < MAX
int qh, qt = 1; // queue head, tail

void State::operator= (const char* s) {
int p = 6;
for (int x = 3; x >= 0; x--)
for (int y = 4; y >= 0; y--) {
#define CASE(c, i) case c: aa = x; aa = y; break;
switch (s) {
// 曹关张黄子龙(l)马
CASE('c', 0) CASE('g', 1) CASE('z', 2) CASE('h', 3) CASE('l', 4) CASE('m', 5)
case 'p':
   aa = x; aa = y; break; // pawn
}
}
static const char*S[] = { "gg", "zz", "hh", "ll", "mm" };
for (int i = 0; i < 5; i++) if (strstr(s, S)) W = 1, H = 2;
for (int i = 0; i < 10; i++) T = W * 2 + H - 2;
}

#define set2DArrayByCoordInA(a, b, what) for (int i = 0; i < 10; i++) { \
const int x = a, y = a; \
for (int yy = y; yy < y + H; yy++) \
for (int xx = x; xx < x + W; xx++) b = what; \
}

void State::print (const char* s) {
int idx = {};
const char* b = {};
set2DArrayByCoordInA(aa, b, NM++])
for (int y = 0; y < 5; y++) {
for (int x = 0; x < 4; x++) printf("%s", b ? : "　");
puts("");
}
printf("%s\n", s);
}

void print_path () { // 数组存的单链表就地翻转
int prev = -1, next = -1, n = 0;
for (int cur = qh; cur != -1; ++n) {
next = q.p; q.p = prev;
prev = cur; cur = next;
}
for (int p = 0; p != -1; p = q.p) q.print();
printf("%d\n", n);
}

#pragma pack(1)
struct {
uint8_t b;
uint64_t n; // unique number
uint8_t _;
} u __attribute__((aligned(32)));
#pragma pack()

#define Ah { \
_mm256_store_si256((__m256i*)&u, _mm256_setzero_si256()); \
set2DArrayByCoordInA(a, u.b, T) \
for (int y = 0; y < 5; y++) \
for (int x = 0; x < 4; x++) u.n = u.n * 5 + u.b; \
}

#ifdef st
CWISS_DECLARE_FLAT_HASHSET(Set, uint64_t); Set set = Set_new(MAX);
#define if_ok_add_state if (ok) { cpy20(q.aa, a); \
Ah if (!Set_contains(&set, &u.n)) { Set_insert(&set, &u.n); q.p = qh; } \
}
#else
std::set<uint64_t> set;
#define if_ok_add_state if (ok) { cpy20(q.aa, a); \
Ah if (set.find(u.n) == set.end()) { set.insert(u.n); q.p = qh; } \
}
#endif

int main (int argc, char* argv[]) {
//q = "pp zz""ccghh""ccgll""pp mm";
q = (argc == 2) ? argv : "pzzpg""ccg""ccllm""phhpm";
q.p = -1;

uint8_t a __attribute__((aligned(32)));
cpy20(a, q.aa);
#ifdef st
Ah Set_insert(&set, &u.n);
#else
Ah set.insert(u.n);
#endif

double tm = clock();
for (; qh < qt; qh++) {
if (q.CCY == 3) { print_path(); break; }

cpy20(a, q.aa);
uint8_t b __attribute__((aligned(32))), overflow;
_mm256_store_si256((__m256i*)b, _mm256_set1_epi8(1));
set2DArrayByCoordInA(a, b, 0)

int qt1 = qt;
for (int j = 0; j < 4; j++) {
   int ok;
   for (int i = 0; i < 6; i++) {
   const int ox = a, oy = a;
   int x = (a += D), y = (a += D);
   if (x < 0 || x + W > 4 || y < 0 || y + H > 5) ok = 0;
   else { // D[]的顺序不能换!
      if (j == 0) { y = oy + H; ok = b && (W == 1 || b); }
      else if (j == 1) ok = b & (!(W - 1) | b); // W1或2; 不更快
      else if (j == 2) ok = b & (!(H - 1) | b);
      else { x = ox + W; ok = b && (H == 1 || b); }
   }
   if_ok_add_state
   a -= D; a -= D;
   }
   for (int i = 6; i < 10; i++) {
   const int ox = a, oy = a;
   int x = (a += D), y = (a += D);
   if (x < 0 || x + W > 4 || y < 0 || y + H > 5) ok = 0;
   else {
      if (j == 0) y = oy + H;
      else if (j == 3) x = ox + W;
      ok = b;
   }
   if_ok_add_state
   a -= D; a -= D;
   }
}
#if 1
int qt2 = qt - 1;
if (qt2 > qt1 && q.CCY < q.CCY) { State t = q; q = q; q = t; }
#endif
}

printf("%.6f %d\n", (clock() - tm) / CLOCKS_PER_SEC, qt);
return 0;
}有个负优化而不是bug. q.CCY < q.CCY的目的是优先搜索曹操靠下的局面。下方的y大，< 应为 >
既然优先把所有的块往下挪，为啥会出现q.CCY < q.CCY的情形呢？
当前局面是最左边时，程序确实优先往下挪，可挪不动。有2个合法移动：
美　　美美曹贼美  美　美
美曹贼貂美真爽貂美曹贼貂
昭真爽蝉昭　　蝉昭真爽蝉
君甄玉环君甄玉环  君甄玉环
美姬西施美姬西施  美姬西施
我们是方向优先，且优先往下（前提是能挪动），块则是曹操第一，往下挪不动往上挪了。
qt1=11788 qt2=11789 qt=11790 (尚未使用）
把，和#if 0后发现，这个启发信息没用，速度一样（起码差别微乎其微）。
这个程序速度很快。以上结果是在Intel N100上跑的。注释掉找到局面则退出的代码，0.081639秒内搜索了81,340个局面，判断出无解。
On a SUN Sparcstation 2, the timings increase (at a rate of 0.005 second/pixel) from about 20 seconds for a 64**2 pixel image to about 320 seconds for 256**2 pixels. 〔详情〕
它功能强，支持各种块形状。我觉得华容道太难了，并不手动玩，也许可以改成3个长条6个小方块之类。
我alias gcc='g++ -mavx2'。Ah(q.aa)美吗？还是改成Emm?
“有2个合法移动”——不对啊，待查，结果：qh=10933 反正我0.02秒，不SEGV，不管啦！:-)
〔优化历史〕
AI写的五进制转换程序：
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

// SSE优化版本
uint64_t convert_to_quinary_sse(uint8_t b)
{
uint64_t n = 0;

// 处理前16个元素（4行）使用SSE
for (int y = 0; y < 4; y++) {
// 加载4个uint8_t值到128位寄存器
__m128i row = _mm_loadu_si128((__m128i *) b);

// 将uint8_t转换为uint64_t并进行五进制转换
for (int x = 0; x < 4; x++) {
   uint8_t value = b;
   n = n * 5 + value;
}
}

// 处理最后一行（第5行）
for (int x = 0; x < 4; x++) {
n = n * 5 + b;
}

return n;
}

// AVX2优化版本（处理所有20个元素）
uint64_t convert_to_quinary_avx2(uint8_t b)
{
uint64_t n = 0;

// 使用AVX2处理前16个元素
for (int y = 0; y < 4; y++) {
// 加载4个uint8_t值
__m128i row = _mm_loadu_si128((__m128i *) b);

// 解包到32位整数
__m128i zero = _mm_setzero_si128();
__m128i row32 = _mm_cvtepu8_epi32(row);

// 对每个元素进行五进制转换
uint32_t values;
_mm_storeu_si128((__m128i *) values, row32);

for (int i = 0; i < 4; i++) {
   n = n * 5 + values;
}
}

// 处理最后4个元素
for (int x = 0; x < 4; x++) {
n = n * 5 + b;
}

return n;
}

// 高度优化的AVX版本，使用并行计算
uint64_t convert_to_quinary_avx_parallel(uint8_t b)
{
// 使用多个累加器并行计算
uint64_t n1 = 0, n2 = 0, n3 = 0, n4 = 0;

// 并行处理4行
for (int y = 0; y < 4; y++) {
// 加载当前行
__m128i row = _mm_loadu_si128((__m128i *) b);

// 转换为32位整数
__m128i row32 = _mm_cvtepu8_epi32(row);

uint32_t values;
_mm_storeu_si128((__m128i *) values, row32);

// 并行累加（模拟并行处理）
n1 = n1 * 625 + values * 125 + values * 25 + values * 5 + values;
}

// 合并结果并添加最后一行
uint64_t final_n = n1;
for (int x = 0; x < 4; x++) {
final_n = final_n * 5 + b;
}

return final_n;
}

// 基准版本（原始代码）
uint64_t convert_to_quinary_baseline(uint8_t b)
{
uint64_t n = 0;
for (int y = 0; y < 5; y++) {
for (int x = 0; x < 4; x++) {
   n = n * 5 + b;
}
}
return n;
}

int main()
{
uint8_t b = {
{ 1, 2, 3, 4 },
{ 0, 4, 2, 1 },
{ 3, 1, 4, 2 },
{ 2, 0, 1, 3 },
{ 4, 3, 0, 2 }
};

uint64_t result_baseline = convert_to_quinary_baseline(b);
uint64_t result_sse = convert_to_quinary_sse(b);
uint64_t result_avx2 = convert_to_quinary_avx2(b);
uint64_t result_avx_parallel = convert_to_quinary_avx_parallel(b);

printf("基准版本结果: %lu\n", result_baseline);
printf("SSE优化结果:%lu\n", result_sse);
printf("AVX2优化结果: %lu\n", result_avx2);
printf("AVX并行结果: %lu\n", result_avx_parallel);

if (result_baseline == result_sse &&
   result_sse == result_avx2 && result_avx2 == result_avx_parallel) {
printf("✓ 所有版本结果一致\n");
} else
printf("✗ 结果不一致\n");
return 0;
}View Code　　

来源：程序园用户自行投稿发布，如果侵权，请联系站长删除
免责声明：如果侵犯了您的权益，请联系站长，我们会及时删除侵权内容，谢谢合作！

挠溃症 发表于 2025-11-29 21:37:58

谢谢分享，试用一下

页: [1]

程序园's Archiver

0.02秒找到最优解的华容道程序