// 3x3 grid
// Based on Littman, Cassandra and Kaelbling,
// "Learning policies for partially observable environments: Scaling up",
// Technical Report CS, Brown University

pomdp

const int N = 3; // grid size

// Only the target, which is in the south-east corner, is observable
// (as is whether the initialisation step has been done)
formula target = x=N-1 & y=0;
observable "target" = target;
observable "started" = started;

module grid

	x : [0..N-1]; // x coordinate
	y : [0..N-1]; // y coordinate
	started : bool; // initialised?

	// initially placed uniformly at random within the grid (not at the target)
	[] !started -> 1/8 : (started'=true) & (x'=0) & (y'=0)
	             + 1/8 : (started'=true) & (x'=0) & (y'=1)
	             + 1/8 : (started'=true) & (x'=0) & (y'=2)
	             + 1/8 : (started'=true) & (x'=1) & (y'=0)
	             + 1/8 : (started'=true) & (x'=1) & (y'=1)
	             + 1/8 : (started'=true) & (x'=1) & (y'=2)
	          // + 1/8 : (started'=true) & (x'=2) & (y'=0) // the target
	             + 1/8 : (started'=true) & (x'=2) & (y'=1)
	             + 1/8 : (started'=true) & (x'=2) & (y'=2);

	// move around the grid
	[east] started & !target -> (x'=min(x+1,N-1));
	[west] started & !target -> (x'=max(x-1,0));
	[north] started & !target -> (y'=min(y+1,N-1));
	[south] started & !target -> (y'=max(y-1,0));

	// reached the target
	[done] target -> true;

endmodule

// reward structure: number of steps to reach the target
rewards
	[east] true : 1;
	[west] true : 1;
	[north] true : 1;
	[south] true : 1;
endrewards
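
// Example query (a hedged sketch, not part of the original model): the
// minimum expected number of steps to reach the target, over all
// observation-based strategies. PRISM properties normally live in a
// separate properties file; the formula "target" defined above is in
// scope once this model is loaded, and the query refers to the unnamed
// reward structure above.
//
//   Rmin=? [ F target ]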