0.02秒找到最优解的华容道程序
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <immintrin.h>
#include <xmmintrin.h>
#include <set>
#include "cwisstable.h"
const char* NM[] = { {"曹","贼","真","爽"}, {"西","施"}, {"昭","君"}, {"貂","蝉"}, {"甄","姬"}, {"玉","环"}, {"美"}, {"美"}, {"美"}, {"美"} };
int W[] = { 2, 2, 2, 2, 2, 2, 1, 1, 1, 1 }; // 默认5个水平条,随后修改
int H[] = { 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
int T; // Type
const int D[] = { {0, 1}, {0, -1}, {-1, 0}, {1, 0} }; // 后面有两处用下标判断移动方向
enum { MAX = 3600 * 10000 };
struct State {
uint8_t aa;
#define CCY aa // 曹操的y
int p; // 局面路径的previous
#define cpy20(dst, src) _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src)); *(int*)((uint8_t*)dst+16) = *(int*)((const uint8_t*)src+16)
void operator= (const State& s) { cpy20(aa, s.aa); p = s.p; }
void operator=(const char* s);
void print(const char* s = "");
} q; // 最多40个move. MAX很大; 没判断qt < MAX
int qh, qt = 1; // queue head, tail
void State::operator= (const char* s) {
int p = 6;
for (int x = 3; x >= 0; x--)
for (int y = 4; y >= 0; y--) {
#define CASE(c, i) case c: aa = x; aa = y; break;
switch (s) {
// 曹关张黄子龙(l)马
CASE('c', 0) CASE('g', 1) CASE('z', 2) CASE('h', 3) CASE('l', 4) CASE('m', 5)
case 'p':
aa = x; aa = y; break; // pawn
}
}
static const char*S[] = { "gg", "zz", "hh", "ll", "mm" };
for (int i = 0; i < 5; i++) if (strstr(s, S)) W = 1, H = 2;
for (int i = 0; i < 10; i++) T = W * 2 + H - 2;
}
#define set2DArrayByCoordInA(a, b, what) for (int i = 0; i < 10; i++) { \
const int x = a, y = a; \
for (int yy = y; yy < y + H; yy++) \
for (int xx = x; xx < x + W; xx++) b = what; \
}
void State::print (const char* s) {
int idx = {};
const char* b = {};
set2DArrayByCoordInA(aa, b, NM++])
for (int y = 0; y < 5; y++) {
for (int x = 0; x < 4; x++) printf("%s", b ? : " ");
puts("");
}
printf("%s\n", s);
}
void print_path () { // 数组存的单链表就地翻转
int prev = -1, next = -1, n = 0;
for (int cur = qh; cur != -1; ++n) {
next = q.p; q.p = prev;
prev = cur; cur = next;
}
for (int p = 0; p != -1; p = q.p) q.print();
printf("%d\n", n);
}
#pragma pack(1)
struct {
uint8_t b;
uint64_t n; // unique number
uint8_t _;
} u __attribute__((aligned(32)));
#pragma pack()
#define Ah { \
_mm256_store_si256((__m256i*)&u, _mm256_setzero_si256()); \
set2DArrayByCoordInA(a, u.b, T) \
for (int y = 0; y < 5; y++) \
for (int x = 0; x < 4; x++) u.n = u.n * 5 + u.b; \
}
#ifdef st
CWISS_DECLARE_FLAT_HASHSET(Set, uint64_t); Set set = Set_new(MAX);
#define if_ok_add_state if (ok) { cpy20(q.aa, a); \
Ah if (!Set_contains(&set, &u.n)) { Set_insert(&set, &u.n); q.p = qh; } \
}
#else
std::set<uint64_t> set;
#define if_ok_add_state if (ok) { cpy20(q.aa, a); \
Ah if (set.find(u.n) == set.end()) { set.insert(u.n); q.p = qh; } \
}
#endif
int main (int argc, char* argv[]) {
//q = "pp zz""ccghh""ccgll""pp mm";
q = (argc == 2) ? argv : "pzzpg""ccg""ccllm""phhpm";
q.p = -1;
uint8_t a __attribute__((aligned(32)));
cpy20(a, q.aa);
#ifdef st
Ah Set_insert(&set, &u.n);
#else
Ah set.insert(u.n);
#endif
double tm = clock();
for (; qh < qt; qh++) {
if (q.CCY == 3) { print_path(); break; }
cpy20(a, q.aa);
uint8_t b __attribute__((aligned(32))), overflow;
_mm256_store_si256((__m256i*)b, _mm256_set1_epi8(1));
set2DArrayByCoordInA(a, b, 0)
int qt1 = qt;
for (int j = 0; j < 4; j++) {
int ok;
for (int i = 0; i < 6; i++) {
const int ox = a, oy = a;
int x = (a += D), y = (a += D);
if (x < 0 || x + W > 4 || y < 0 || y + H > 5) ok = 0;
else { // D[]的顺序不能换!
if (j == 0) { y = oy + H; ok = b && (W == 1 || b); }
else if (j == 1) ok = b & (!(W - 1) | b); // W1或2; 不更快
else if (j == 2) ok = b & (!(H - 1) | b);
else { x = ox + W; ok = b && (H == 1 || b); }
}
if_ok_add_state
a -= D; a -= D;
}
for (int i = 6; i < 10; i++) {
const int ox = a, oy = a;
int x = (a += D), y = (a += D);
if (x < 0 || x + W > 4 || y < 0 || y + H > 5) ok = 0;
else {
if (j == 0) y = oy + H;
else if (j == 3) x = ox + W;
ok = b;
}
if_ok_add_state
a -= D; a -= D;
}
}
#if 1
int qt2 = qt - 1;
if (qt2 > qt1 && q.CCY < q.CCY) { State t = q; q = q; q = t; }
#endif
}
printf("%.6f %d\n", (clock() - tm) / CLOCKS_PER_SEC, qt);
return 0;
}有个负优化而不是bug. q.CCY < q.CCY的目的是优先搜索曹操靠下的局面。下方的y大,< 应为 >
既然优先把所有的块往下挪,为啥会出现q.CCY < q.CCY的情形呢?
当前局面是最左边时,程序确实优先往下挪,可挪不动。有2个合法移动:
美 美 美曹贼美 美 美
美曹贼貂 美真爽貂 美曹贼貂
昭真爽蝉 昭 蝉 昭真爽蝉
君甄玉环 君甄玉环 君甄玉环
美姬西施 美姬西施 美姬西施
我们是方向优先,且优先往下(前提是能挪动),块则是曹操第一,往下挪不动往上挪了。
qt1=11788 qt2=11789 qt=11790 (尚未使用)
把,和#if 0后发现,这个启发信息没用,速度一样(起码差别微乎其微)。
这个程序速度很快。以上结果是在Intel N100上跑的。注释掉找到局面则退出的代码,0.081639秒内搜索了81,340个局面,判断出无解。
On a SUN Sparcstation 2, the timings increase (at a rate of 0.005 second/pixel) from about 20 seconds for a 64**2 pixel image to about 320 seconds for 256**2 pixels. 〔详情〕
它功能强,支持各种块形状。我觉得华容道太难了,并不手动玩,也许可以改成3个长条6个小方块之类。
我alias gcc='g++ -mavx2'。Ah(q.aa)美吗?还是改成Emm?
“有2个合法移动”——不对啊,待查,结果:qh=10933 反正我0.02秒,不SEGV,不管啦!:-)
〔优化历史〕
AI写的五进制转换程序:
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>
// SSE优化版本
uint64_t convert_to_quinary_sse(uint8_t b)
{
uint64_t n = 0;
// 处理前16个元素(4行)使用SSE
for (int y = 0; y < 4; y++) {
// 加载4个uint8_t值到128位寄存器
__m128i row = _mm_loadu_si128((__m128i *) b);
// 将uint8_t转换为uint64_t并进行五进制转换
for (int x = 0; x < 4; x++) {
uint8_t value = b;
n = n * 5 + value;
}
}
// 处理最后一行(第5行)
for (int x = 0; x < 4; x++) {
n = n * 5 + b;
}
return n;
}
// AVX2优化版本(处理所有20个元素)
uint64_t convert_to_quinary_avx2(uint8_t b)
{
uint64_t n = 0;
// 使用AVX2处理前16个元素
for (int y = 0; y < 4; y++) {
// 加载4个uint8_t值
__m128i row = _mm_loadu_si128((__m128i *) b);
// 解包到32位整数
__m128i zero = _mm_setzero_si128();
__m128i row32 = _mm_cvtepu8_epi32(row);
// 对每个元素进行五进制转换
uint32_t values;
_mm_storeu_si128((__m128i *) values, row32);
for (int i = 0; i < 4; i++) {
n = n * 5 + values;
}
}
// 处理最后4个元素
for (int x = 0; x < 4; x++) {
n = n * 5 + b;
}
return n;
}
// 高度优化的AVX版本,使用并行计算
uint64_t convert_to_quinary_avx_parallel(uint8_t b)
{
// 使用多个累加器并行计算
uint64_t n1 = 0, n2 = 0, n3 = 0, n4 = 0;
// 并行处理4行
for (int y = 0; y < 4; y++) {
// 加载当前行
__m128i row = _mm_loadu_si128((__m128i *) b);
// 转换为32位整数
__m128i row32 = _mm_cvtepu8_epi32(row);
uint32_t values;
_mm_storeu_si128((__m128i *) values, row32);
// 并行累加(模拟并行处理)
n1 = n1 * 625 + values * 125 + values * 25 + values * 5 + values;
}
// 合并结果并添加最后一行
uint64_t final_n = n1;
for (int x = 0; x < 4; x++) {
final_n = final_n * 5 + b;
}
return final_n;
}
// 基准版本(原始代码)
uint64_t convert_to_quinary_baseline(uint8_t b)
{
uint64_t n = 0;
for (int y = 0; y < 5; y++) {
for (int x = 0; x < 4; x++) {
n = n * 5 + b;
}
}
return n;
}
int main()
{
uint8_t b = {
{ 1, 2, 3, 4 },
{ 0, 4, 2, 1 },
{ 3, 1, 4, 2 },
{ 2, 0, 1, 3 },
{ 4, 3, 0, 2 }
};
uint64_t result_baseline = convert_to_quinary_baseline(b);
uint64_t result_sse = convert_to_quinary_sse(b);
uint64_t result_avx2 = convert_to_quinary_avx2(b);
uint64_t result_avx_parallel = convert_to_quinary_avx_parallel(b);
printf("基准版本结果: %lu\n", result_baseline);
printf("SSE优化结果:%lu\n", result_sse);
printf("AVX2优化结果: %lu\n", result_avx2);
printf("AVX并行结果: %lu\n", result_avx_parallel);
if (result_baseline == result_sse &&
result_sse == result_avx2 && result_avx2 == result_avx_parallel) {
printf("✓ 所有版本结果一致\n");
} else
printf("✗ 结果不一致\n");
return 0;
}View Code
来源:程序园用户自行投稿发布,如果侵权,请联系站长删除
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作! 谢谢分享,试用一下
页:
[1]