From c36035c0667fdd224da914b50c30a9366e1a5c38 Mon Sep 17 00:00:00 2001 From: Dennis Kobert Date: Tue, 7 Jan 2020 03:12:55 +0100 Subject: Fix global worgsize calculation --- src/main.rs | 8 ++--- src/solvers/check.cl | 29 ++++++++++------- src/solvers/gpusolver.rs | 20 +++++++----- src/solvers/intuitive.rs | 83 ++++++++++++++++++------------------------------ src/solvers/opencl.rs | 75 +++++++++++++++++++++++++++++++------------ 5 files changed, 119 insertions(+), 96 deletions(-) diff --git a/src/main.rs b/src/main.rs index fdf3fcd..f9775ed 100755 --- a/src/main.rs +++ b/src/main.rs @@ -6,11 +6,11 @@ use crate::solver::{IteratorSolver, Solver}; pub static N: u32 = 8; fn main() { - //let mut solver = solvers::intuitive::NormalSolver::new(N); - //solver.solve(); - let solver = solvers::gpusolver::GpuSolver::new(N); + let mut solver = solvers::intuitive::NormalSolver::new(N); + solver.solve(); + /*let solver = solvers::gpusolver::GpuSolver::new(N); println!("solver: {:?}", solver); for (i, solution) in solver.solve().enumerate() { println!("{}: {:?}", i, solution); - } + }*/ } diff --git a/src/solvers/check.cl b/src/solvers/check.cl index 15069b9..af9bc51 100644 --- a/src/solvers/check.cl +++ b/src/solvers/check.cl @@ -3,36 +3,41 @@ typedef ulong u64; typedef uint u32; __kernel void check(__global u64* permutations, __global u64* results, - __local u64* instructions, __local u64* i_buffer, u32 n, u32 w, u64 offset) { - int gid = get_global_id(0); + __global u64* instructions, __local u64* i_buffer, u32 n, u32 w, u64 offset) { int wid = get_local_id(0); + i_buffer[wid] = instructions[wid]; + barrier(CLK_LOCAL_MEM_FENCE); + + int gid = get_global_id(0); int gsize = get_global_size(0); int wsize = get_local_size(0); + u32 w64 = wsize / 64 + !!(wsize & 63); + u64 result = 0; + u64 own = permutations[gid + offset]; + u64 correct_mask = (1 << (n + 1)) - 2; - i_buffer[wid] = instructions[wid]; - barrier(CLK_LOCAL_MEM_FENCE); - u64 result = 0; - unsigned long own = permutations[gid + offset]; - for (int j = 0; j < wsize; j++) { + int j; + for (j = 0; j < wsize; j++) { unsigned long curr_mask = i_buffer[j] | own; unsigned long tmask, sum, stones; stones = tmask = sum = 0; for (int i = 1; i <= w + 1; i++) { - if ((curr_mask & (1 << i)) == 0) { - stones += 1; + if (!(curr_mask & (1 << i))) { + stones++; tmask |= 1 << (i - sum); sum = i; } } - if (tmask == (1 << (n + 1)) - 2 && stones == n) { + if (tmask == correct_mask && stones == n) { result |= 1 << (j & 63); } - if (j & !(j & 63)) { - results[gid * (wsize / 64) + j / 64] = result; + if (j % 64 == 0 && j) { + results[gid * w64 + j / 64] = result; result = 0; } } + results[gid * w64 + j / 64] = result; } diff --git a/src/solvers/gpusolver.rs b/src/solvers/gpusolver.rs index 69c400e..3a96568 100644 --- a/src/solvers/gpusolver.rs +++ b/src/solvers/gpusolver.rs @@ -1,5 +1,6 @@ use crate::permutations::PermutationGenerator; use crate::solver::{wall_stats, IteratorSolver, Solver}; +use crate::solvers::opencl; use crate::structs::StoneWall; #[derive(Debug)] @@ -16,14 +17,17 @@ impl GpuSolver { let src = std::fs::read_to_string("src/solvers/check.cl").expect("failed to open kernel file"); - crate::solvers::opencl::GpuSolver::launch_sevice( - &self.masks, - self.n, - self.h, - self.w, - src.as_ref(), - ) - .unwrap(); + let senders = + opencl::GpuSolver::launch_sevice(&self.masks, self.n, self.h, self.w, 4, src.as_ref()) + .unwrap(); + for i in 0..12 { + senders[1 - i / 6] + .send(opencl::Job::new(vec![i as u32], self.masks[i])) + .unwrap(); + } + loop { + std::thread::sleep(std::time::Duration::from_secs(5)); + } vec![] } } diff --git a/src/solvers/intuitive.rs b/src/solvers/intuitive.rs index 3db1d33..bb23a39 100755 --- a/src/solvers/intuitive.rs +++ b/src/solvers/intuitive.rs @@ -13,6 +13,7 @@ pub struct NormalSolver { /// Use to store already used blocks as a bitmask permutations: Vec>, masks: Vec, + senders: Vec>, } static mut TRIES: u32 = 0; @@ -38,6 +39,12 @@ impl NormalSolver { masks[j] |= 1 << sum; } } + + let src = + std::fs::read_to_string("src/solvers/check.cl").expect("failed to open kernel file"); + + let senders = + super::opencl::GpuSolver::launch_sevice(&masks, n, h, w, 0, src.as_ref()).unwrap(); Self { n, h, @@ -46,14 +53,15 @@ impl NormalSolver { mask: (1 << w) - 2, permutations, masks, + senders, } } pub fn solve(&mut self) { for (n, i) in self.permutations.iter().enumerate() { let tmp: Vec = i.iter().map(|x| *x).collect(); - println!("perm {}: {:?}", n, tmp); - println!("perm {}: {:b}", n, self.masks[n]); + //println!("perm {}: {:?}", n, tmp); + //println!("perm {}: {:b}", n, self.masks[n]); } println!("calculate results"); self.permute( @@ -66,58 +74,29 @@ impl NormalSolver { .as_ref(), ); unsafe { println!("tries: {}\nsolutions: {}", TRIES, SOLUTIONS) } + loop { + std::thread::sleep(std::time::Duration::from_secs(5)); + } } fn permute(&self, up: usize, index: usize, curr_mask: u64, numbers: &[u32]) { - if index as usize == numbers.len() { - //println!("checking {:?}", numbers); - unsafe { - TRIES += 1; - } - let mut tmask: u64 = 0; - let mut sum = 0; - let mut stones = 0; - for i in 1..=(self.w + 1) { - if curr_mask & (1 << i) == 0 { - stones += 1; - tmask |= 1 << (i - sum); - sum = i; - } - } - if tmask == (1 << (self.n + 1)) - 2 && stones == self.n { - println!("tmask: {:b}", tmask); - println!("curr: {:b}", curr_mask); - //println!("success"); - unsafe { - SOLUTIONS += 1; - } - for i in numbers { - println!("{}\t{}", numbers[0], i); - } - } - return; - } let mut new_num = Vec::from(numbers); let start = numbers[index as usize] / self.chunk; if index as usize == numbers.len() - 1 { - #[cfg(feature = "gpu")] - { - crate::solvers::opencl::check( - self.masks.as_ref(), - self.w, - self.n, - curr_mask, - (start * self.chunk) as usize, - ) + //#[cfg(feature = "gpu")] + //{ + let i = self.n - 2 - numbers[index] / self.chunk; + self.senders[i as usize] + .send(super::opencl::Job::new(new_num, curr_mask)) .unwrap(); - return; - } + return; + //} } for i in start..self.n - (self.h - 1 - index as u32) { for n in 1..(numbers.len() - index) { new_num[n + index] = (n as u32 + i) * self.chunk; } - if index == 0 && false { + /*if index == 0 { (0..self.chunk).into_par_iter().for_each(|j| { let mut new_num = new_num.clone(); let tmp = i * self.chunk + j; @@ -129,17 +108,17 @@ impl NormalSolver { &new_num, ); }); - } else { - for j in 0..self.chunk { - new_num[index] = i * self.chunk + j; - self.permute( - up, - index + 1, - curr_mask | self.masks[new_num[index] as usize], - &new_num, - ); - } + } else {*/ + for j in 0..self.chunk { + new_num[index] = i * self.chunk + j; + self.permute( + up, + index + 1, + curr_mask | self.masks[new_num[index] as usize], + &new_num, + ); } + //} } } } diff --git a/src/solvers/opencl.rs b/src/solvers/opencl.rs index cdedd37..550ace9 100644 --- a/src/solvers/opencl.rs +++ b/src/solvers/opencl.rs @@ -6,6 +6,13 @@ pub struct Job { bitmask: u64, } +impl Job { + pub fn new(rows: Vec, bitmask: u64) -> Self { + Self { rows, bitmask } + } +} + +#[derive(Debug)] pub struct GpuSolver { #[allow(unused)] platform: Platform, @@ -18,11 +25,14 @@ pub struct GpuSolver { n: u32, h: u32, w: u32, + /// Workgroup size, set to 0 for max wg_size: usize, permutations: Buffer, rec_queues: Vec, + walls: Vec>, } +#[derive(Debug)] struct RequestBuffer { mask_buff: Vec, row_buff: Vec>, @@ -33,8 +43,8 @@ struct RequestBuffer { impl RequestBuffer { pub fn new(size: usize, receiver: Receiver) -> Self { RequestBuffer { - mask_buff: Vec::with_capacity(size), - row_buff: Vec::with_capacity(size), + mask_buff: vec![0; size], + row_buff: vec![Vec::new(); size], pointer: 0, receiver, } @@ -85,15 +95,15 @@ impl GpuSolver { .len(permutation_masks.len()) .build()?; - let mut senders = Vec::with_capacity(h as usize); - let mut receivers = Vec::with_capacity(h as usize); + let mut senders = Vec::with_capacity((n - h + 1) as usize); + let mut receivers = Vec::with_capacity((n - h + 1) as usize); let max_wg_size = device.max_wg_size()?; if wg_size == 0 { wg_size = max_wg_size; } else if wg_size > max_wg_size { return Err(ocl::Error::from("invalid workgroup size")); } - for _ in 0..h { + for _ in 0..=(n - h) { let (sx, rx) = std::sync::mpsc::channel(); senders.push(sx); receivers.push(RequestBuffer::new(wg_size, rx)); @@ -111,34 +121,40 @@ impl GpuSolver { wg_size, permutations: buffer, rec_queues: receivers, + walls: Vec::new(), }; - std::thread::spawn(move || { - solver.run(); - }); + std::thread::Builder::new() + .name("GPU Deamon".into()) + .spawn(move || { + solver.run(); + }) + .unwrap(); + println!("started gpu thread"); Ok(senders) } fn get_dim(&self, queue: usize) -> usize { let chunk = self.permutations.len() / self.n as usize; let dim = (queue + 1) * chunk; - if dim % self.wg_size == 0 { - dim - } else { - let dim_small = dim / self.wg_size; - ((dim_small + 1) * chunk) - } + (dim + self.wg_size - 1) / self.wg_size * self.wg_size } fn get_off(&self, queue: usize) -> u64 { let chunk = self.permutations.len() / self.n as usize; - let dim = (queue + 1) * chunk; - ((self.n as usize - queue - 1) * chunk - self.wg_size + dim % self.wg_size) as u64 + let off = self.permutations.len() - chunk - self.get_dim(queue); + off as u64 + } + fn get_res(&self, queue: usize) -> usize { + let dim = self.get_dim(queue); + dim * self.get_res_save_dim() + } + fn get_res_save_dim(&self) -> usize { + (self.wg_size + 63) / 64 } fn run(mut self) -> ! { let queues = self.rec_queues.len(); let mut instruction_buffer = Vec::with_capacity((self.n - self.h) as usize); let mut result_buffer = Vec::with_capacity((self.n - self.h) as usize); - let chunk = self.permutations.len() / self.n as usize; for i in 0..queues { let buffer: Buffer = Buffer::builder() @@ -151,17 +167,21 @@ impl GpuSolver { instruction_buffer.push(buffer); let results: Buffer = Buffer::builder() .queue(self.queue.clone()) - .len(self.wg_size * self.get_dim(i) / 64) + .len(self.get_res(i)) .flags(flags::MEM_READ_WRITE) .build() .unwrap(); result_buffer.push(results); } + println!("finished gpu setup"); for i in (0..self.rec_queues.len()).cycle() { if let Some(buffer) = self.rec_queues[i].read() { instruction_buffer[i].write(buffer).enq().unwrap(); let dim = self.get_dim(i); + //println!("dim: {}", dim); + //println!("off: {}", self.get_off(i)); + //println!("result size: {}", self.get_res_save_dim()); let kernel = Kernel::builder() .program(&self.program) .name("check") @@ -189,7 +209,7 @@ impl GpuSolver { } // (5) Read results from the device into a vector (`::block` not shown): - let mut result = vec![0u64; dim * self.wg_size / 64]; + let mut result = vec![0u64; self.get_res(i)]; result_buffer[i] .cmd() .queue(&self.queue) @@ -197,7 +217,22 @@ impl GpuSolver { .read(&mut result) .enq() .unwrap(); - println!("{:?}", result); + for (j, r) in result.iter().enumerate() { + if j == 0 { + continue; + } + for b in 0..64 { + if r & (1 << b) != 0 { + let permutation = + j / self.get_res_save_dim() + self.get_off(i) as usize; + let instruction = (j % self.get_res_save_dim()) * 64 + b; + let mut wall = self.rec_queues[i].get_rows()[instruction].clone(); + wall.push(permutation as u32); + println!("{:?}", wall); + self.walls.push(wall); + } + } + } } } panic!(); -- cgit v1.2.3-54-g00ecf