summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDennis Kobert <dennis@kobert.dev>2020-01-07 03:12:55 +0100
committerDennis Kobert <dennis@kobert.dev>2020-01-07 03:12:55 +0100
commitc36035c0667fdd224da914b50c30a9366e1a5c38 (patch)
tree4d67f21f89a138e9859540eb5d2446dec5660aef
parent2750313bab1bff5b94734f949e633f04391577ab (diff)
Fix global worgsize calculation
-rwxr-xr-xsrc/main.rs8
-rw-r--r--src/solvers/check.cl29
-rw-r--r--src/solvers/gpusolver.rs20
-rwxr-xr-xsrc/solvers/intuitive.rs83
-rw-r--r--src/solvers/opencl.rs75
5 files changed, 119 insertions, 96 deletions
diff --git a/src/main.rs b/src/main.rs
index fdf3fcd..f9775ed 100755
--- a/src/main.rs
+++ b/src/main.rs
@@ -6,11 +6,11 @@ use crate::solver::{IteratorSolver, Solver};
pub static N: u32 = 8;
fn main() {
- //let mut solver = solvers::intuitive::NormalSolver::new(N);
- //solver.solve();
- let solver = solvers::gpusolver::GpuSolver::new(N);
+ let mut solver = solvers::intuitive::NormalSolver::new(N);
+ solver.solve();
+ /*let solver = solvers::gpusolver::GpuSolver::new(N);
println!("solver: {:?}", solver);
for (i, solution) in solver.solve().enumerate() {
println!("{}: {:?}", i, solution);
- }
+ }*/
}
diff --git a/src/solvers/check.cl b/src/solvers/check.cl
index 15069b9..af9bc51 100644
--- a/src/solvers/check.cl
+++ b/src/solvers/check.cl
@@ -3,36 +3,41 @@ typedef ulong u64;
typedef uint u32;
__kernel void check(__global u64* permutations, __global u64* results,
- __local u64* instructions, __local u64* i_buffer, u32 n, u32 w, u64 offset) {
- int gid = get_global_id(0);
+ __global u64* instructions, __local u64* i_buffer, u32 n, u32 w, u64 offset) {
int wid = get_local_id(0);
+ i_buffer[wid] = instructions[wid];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int gid = get_global_id(0);
int gsize = get_global_size(0);
int wsize = get_local_size(0);
+ u32 w64 = wsize / 64 + !!(wsize & 63);
+ u64 result = 0;
+ u64 own = permutations[gid + offset];
+ u64 correct_mask = (1 << (n + 1)) - 2;
- i_buffer[wid] = instructions[wid];
- barrier(CLK_LOCAL_MEM_FENCE);
- u64 result = 0;
- unsigned long own = permutations[gid + offset];
- for (int j = 0; j < wsize; j++) {
+ int j;
+ for (j = 0; j < wsize; j++) {
unsigned long curr_mask = i_buffer[j] | own;
unsigned long tmask, sum, stones;
stones = tmask = sum = 0;
for (int i = 1; i <= w + 1; i++) {
- if ((curr_mask & (1 << i)) == 0) {
- stones += 1;
+ if (!(curr_mask & (1 << i))) {
+ stones++;
tmask |= 1 << (i - sum);
sum = i;
}
}
- if (tmask == (1 << (n + 1)) - 2 && stones == n) {
+ if (tmask == correct_mask && stones == n) {
result |= 1 << (j & 63);
}
- if (j & !(j & 63)) {
- results[gid * (wsize / 64) + j / 64] = result;
+ if (j % 64 == 0 && j) {
+ results[gid * w64 + j / 64] = result;
result = 0;
}
}
+ results[gid * w64 + j / 64] = result;
}
diff --git a/src/solvers/gpusolver.rs b/src/solvers/gpusolver.rs
index 69c400e..3a96568 100644
--- a/src/solvers/gpusolver.rs
+++ b/src/solvers/gpusolver.rs
@@ -1,5 +1,6 @@
use crate::permutations::PermutationGenerator;
use crate::solver::{wall_stats, IteratorSolver, Solver};
+use crate::solvers::opencl;
use crate::structs::StoneWall;
#[derive(Debug)]
@@ -16,14 +17,17 @@ impl GpuSolver {
let src =
std::fs::read_to_string("src/solvers/check.cl").expect("failed to open kernel file");
- crate::solvers::opencl::GpuSolver::launch_sevice(
- &self.masks,
- self.n,
- self.h,
- self.w,
- src.as_ref(),
- )
- .unwrap();
+ let senders =
+ opencl::GpuSolver::launch_sevice(&self.masks, self.n, self.h, self.w, 4, src.as_ref())
+ .unwrap();
+ for i in 0..12 {
+ senders[1 - i / 6]
+ .send(opencl::Job::new(vec![i as u32], self.masks[i]))
+ .unwrap();
+ }
+ loop {
+ std::thread::sleep(std::time::Duration::from_secs(5));
+ }
vec![]
}
}
diff --git a/src/solvers/intuitive.rs b/src/solvers/intuitive.rs
index 3db1d33..bb23a39 100755
--- a/src/solvers/intuitive.rs
+++ b/src/solvers/intuitive.rs
@@ -13,6 +13,7 @@ pub struct NormalSolver {
/// Use to store already used blocks as a bitmask
permutations: Vec<Vec<u32>>,
masks: Vec<u64>,
+ senders: Vec<std::sync::mpsc::Sender<super::opencl::Job>>,
}
static mut TRIES: u32 = 0;
@@ -38,6 +39,12 @@ impl NormalSolver {
masks[j] |= 1 << sum;
}
}
+
+ let src =
+ std::fs::read_to_string("src/solvers/check.cl").expect("failed to open kernel file");
+
+ let senders =
+ super::opencl::GpuSolver::launch_sevice(&masks, n, h, w, 0, src.as_ref()).unwrap();
Self {
n,
h,
@@ -46,14 +53,15 @@ impl NormalSolver {
mask: (1 << w) - 2,
permutations,
masks,
+ senders,
}
}
pub fn solve(&mut self) {
for (n, i) in self.permutations.iter().enumerate() {
let tmp: Vec<u32> = i.iter().map(|x| *x).collect();
- println!("perm {}: {:?}", n, tmp);
- println!("perm {}: {:b}", n, self.masks[n]);
+ //println!("perm {}: {:?}", n, tmp);
+ //println!("perm {}: {:b}", n, self.masks[n]);
}
println!("calculate results");
self.permute(
@@ -66,58 +74,29 @@ impl NormalSolver {
.as_ref(),
);
unsafe { println!("tries: {}\nsolutions: {}", TRIES, SOLUTIONS) }
+ loop {
+ std::thread::sleep(std::time::Duration::from_secs(5));
+ }
}
fn permute(&self, up: usize, index: usize, curr_mask: u64, numbers: &[u32]) {
- if index as usize == numbers.len() {
- //println!("checking {:?}", numbers);
- unsafe {
- TRIES += 1;
- }
- let mut tmask: u64 = 0;
- let mut sum = 0;
- let mut stones = 0;
- for i in 1..=(self.w + 1) {
- if curr_mask & (1 << i) == 0 {
- stones += 1;
- tmask |= 1 << (i - sum);
- sum = i;
- }
- }
- if tmask == (1 << (self.n + 1)) - 2 && stones == self.n {
- println!("tmask: {:b}", tmask);
- println!("curr: {:b}", curr_mask);
- //println!("success");
- unsafe {
- SOLUTIONS += 1;
- }
- for i in numbers {
- println!("{}\t{}", numbers[0], i);
- }
- }
- return;
- }
let mut new_num = Vec::from(numbers);
let start = numbers[index as usize] / self.chunk;
if index as usize == numbers.len() - 1 {
- #[cfg(feature = "gpu")]
- {
- crate::solvers::opencl::check(
- self.masks.as_ref(),
- self.w,
- self.n,
- curr_mask,
- (start * self.chunk) as usize,
- )
+ //#[cfg(feature = "gpu")]
+ //{
+ let i = self.n - 2 - numbers[index] / self.chunk;
+ self.senders[i as usize]
+ .send(super::opencl::Job::new(new_num, curr_mask))
.unwrap();
- return;
- }
+ return;
+ //}
}
for i in start..self.n - (self.h - 1 - index as u32) {
for n in 1..(numbers.len() - index) {
new_num[n + index] = (n as u32 + i) * self.chunk;
}
- if index == 0 && false {
+ /*if index == 0 {
(0..self.chunk).into_par_iter().for_each(|j| {
let mut new_num = new_num.clone();
let tmp = i * self.chunk + j;
@@ -129,17 +108,17 @@ impl NormalSolver {
&new_num,
);
});
- } else {
- for j in 0..self.chunk {
- new_num[index] = i * self.chunk + j;
- self.permute(
- up,
- index + 1,
- curr_mask | self.masks[new_num[index] as usize],
- &new_num,
- );
- }
+ } else {*/
+ for j in 0..self.chunk {
+ new_num[index] = i * self.chunk + j;
+ self.permute(
+ up,
+ index + 1,
+ curr_mask | self.masks[new_num[index] as usize],
+ &new_num,
+ );
}
+ //}
}
}
}
diff --git a/src/solvers/opencl.rs b/src/solvers/opencl.rs
index cdedd37..550ace9 100644
--- a/src/solvers/opencl.rs
+++ b/src/solvers/opencl.rs
@@ -6,6 +6,13 @@ pub struct Job {
bitmask: u64,
}
+impl Job {
+ pub fn new(rows: Vec<u32>, bitmask: u64) -> Self {
+ Self { rows, bitmask }
+ }
+}
+
+#[derive(Debug)]
pub struct GpuSolver {
#[allow(unused)]
platform: Platform,
@@ -18,11 +25,14 @@ pub struct GpuSolver {
n: u32,
h: u32,
w: u32,
+ /// Workgroup size, set to 0 for max
wg_size: usize,
permutations: Buffer<u64>,
rec_queues: Vec<RequestBuffer>,
+ walls: Vec<Vec<u32>>,
}
+#[derive(Debug)]
struct RequestBuffer {
mask_buff: Vec<u64>,
row_buff: Vec<Vec<u32>>,
@@ -33,8 +43,8 @@ struct RequestBuffer {
impl RequestBuffer {
pub fn new(size: usize, receiver: Receiver<Job>) -> Self {
RequestBuffer {
- mask_buff: Vec::with_capacity(size),
- row_buff: Vec::with_capacity(size),
+ mask_buff: vec![0; size],
+ row_buff: vec![Vec::new(); size],
pointer: 0,
receiver,
}
@@ -85,15 +95,15 @@ impl GpuSolver {
.len(permutation_masks.len())
.build()?;
- let mut senders = Vec::with_capacity(h as usize);
- let mut receivers = Vec::with_capacity(h as usize);
+ let mut senders = Vec::with_capacity((n - h + 1) as usize);
+ let mut receivers = Vec::with_capacity((n - h + 1) as usize);
let max_wg_size = device.max_wg_size()?;
if wg_size == 0 {
wg_size = max_wg_size;
} else if wg_size > max_wg_size {
return Err(ocl::Error::from("invalid workgroup size"));
}
- for _ in 0..h {
+ for _ in 0..=(n - h) {
let (sx, rx) = std::sync::mpsc::channel();
senders.push(sx);
receivers.push(RequestBuffer::new(wg_size, rx));
@@ -111,34 +121,40 @@ impl GpuSolver {
wg_size,
permutations: buffer,
rec_queues: receivers,
+ walls: Vec::new(),
};
- std::thread::spawn(move || {
- solver.run();
- });
+ std::thread::Builder::new()
+ .name("GPU Deamon".into())
+ .spawn(move || {
+ solver.run();
+ })
+ .unwrap();
+ println!("started gpu thread");
Ok(senders)
}
fn get_dim(&self, queue: usize) -> usize {
let chunk = self.permutations.len() / self.n as usize;
let dim = (queue + 1) * chunk;
- if dim % self.wg_size == 0 {
- dim
- } else {
- let dim_small = dim / self.wg_size;
- ((dim_small + 1) * chunk)
- }
+ (dim + self.wg_size - 1) / self.wg_size * self.wg_size
}
fn get_off(&self, queue: usize) -> u64 {
let chunk = self.permutations.len() / self.n as usize;
- let dim = (queue + 1) * chunk;
- ((self.n as usize - queue - 1) * chunk - self.wg_size + dim % self.wg_size) as u64
+ let off = self.permutations.len() - chunk - self.get_dim(queue);
+ off as u64
+ }
+ fn get_res(&self, queue: usize) -> usize {
+ let dim = self.get_dim(queue);
+ dim * self.get_res_save_dim()
+ }
+ fn get_res_save_dim(&self) -> usize {
+ (self.wg_size + 63) / 64
}
fn run(mut self) -> ! {
let queues = self.rec_queues.len();
let mut instruction_buffer = Vec::with_capacity((self.n - self.h) as usize);
let mut result_buffer = Vec::with_capacity((self.n - self.h) as usize);
- let chunk = self.permutations.len() / self.n as usize;
for i in 0..queues {
let buffer: Buffer<u64> = Buffer::builder()
@@ -151,17 +167,21 @@ impl GpuSolver {
instruction_buffer.push(buffer);
let results: Buffer<u64> = Buffer::builder()
.queue(self.queue.clone())
- .len(self.wg_size * self.get_dim(i) / 64)
+ .len(self.get_res(i))
.flags(flags::MEM_READ_WRITE)
.build()
.unwrap();
result_buffer.push(results);
}
+ println!("finished gpu setup");
for i in (0..self.rec_queues.len()).cycle() {
if let Some(buffer) = self.rec_queues[i].read() {
instruction_buffer[i].write(buffer).enq().unwrap();
let dim = self.get_dim(i);
+ //println!("dim: {}", dim);
+ //println!("off: {}", self.get_off(i));
+ //println!("result size: {}", self.get_res_save_dim());
let kernel = Kernel::builder()
.program(&self.program)
.name("check")
@@ -189,7 +209,7 @@ impl GpuSolver {
}
// (5) Read results from the device into a vector (`::block` not shown):
- let mut result = vec![0u64; dim * self.wg_size / 64];
+ let mut result = vec![0u64; self.get_res(i)];
result_buffer[i]
.cmd()
.queue(&self.queue)
@@ -197,7 +217,22 @@ impl GpuSolver {
.read(&mut result)
.enq()
.unwrap();
- println!("{:?}", result);
+ for (j, r) in result.iter().enumerate() {
+ if j == 0 {
+ continue;
+ }
+ for b in 0..64 {
+ if r & (1 << b) != 0 {
+ let permutation =
+ j / self.get_res_save_dim() + self.get_off(i) as usize;
+ let instruction = (j % self.get_res_save_dim()) * 64 + b;
+ let mut wall = self.rec_queues[i].get_rows()[instruction].clone();
+ wall.push(permutation as u32);
+ println!("{:?}", wall);
+ self.walls.push(wall);
+ }
+ }
+ }
}
}
panic!();