Hi,

The appended patch is an experimental patch to make some arithmetic
operations work even for the data/result were denormalized (subnormal)
floats and doubles.  SH-4 FPU differs at this point with the other
FPUs and issues FPU exceptions when it sees subnormals in some basic
FPU insns.  This causes a few corner case problems and makes noises
in the toolchain tests.  SH-4 manual implies that such case should
be handled with the software emulation and we"ve already done it for
fcnvsd insn.
The patch tries to emulate expected operations for fadd/fsub/fmul
insns on the FPU error.  It uses its own software math emulation
routines specialized to the subnormals.  We could use software math
routines in SH-3 libgcc.a instead of them, though kernel people
wouldn"t like to import more libgcc functions.

This patch would be unnecessary for the normal uses and isn"t tested
well.  It might be not for mainline, your eyes only :-)

Regards,
	kaz
--
	* arch/sh/kernel/cpu/sh4/fpu.c (denormal_mulf): New function.
	(mult64, rshift64, denormal_muld, denormal_subf1, denormal_addf1,
	 denormal_addf, denormal_subd1, denormal_addd1, denormal_addd):
	Likewise.
 	(ieee_fpe_handler): Handle fpu errors for fmul, fadd and fsub.
	(do_fpu_error): Clear fpscr flags and restore the fpu context
	when the emulation is done.  Don"t update pc when issuing SIGFPE.

--- linux-2.6.11-orig/arch/sh/kernel/cpu/sh4/fpu.c	Thu Mar  3 17:28:16 2005
+++ linux-2.6.11-local/arch/sh/kernel/cpu/sh4/fpu.c	Fri Mar 25 13:42:11 2005
@@ -186,6 +186,286 @@ fpu_init(void)
  	disable_fpu();
 }
 
+/*
+ *	Emulate arithmetic ops on denormalized number for some FPU insns.
+ */
+
+/* denormalized float * float */
+static int denormal_mulf(int hx, int hy)
+{
+	unsigned int ix, iy;
+	unsigned long long m, n;
+	int exp, w;
+
+	ix = hx & 0x7fffffff;
+	iy = hy & 0x7fffffff;
+	if (iy < 0x00800000 || ix == 0)
+		return ((hx ^ hy) & 0x80000000);
+
+	exp = (iy & 0x7f800000) >> 23;
+	ix &= 0x007fffff;
+	iy = (iy & 0x007fffff) | 0x00800000;
+	m = (unsigned long long)ix * iy;
+	n = m;
+	w = -1;
+	while (n) { n >>= 1; w++; }
+
+	/* FIXME: use guard bits */	
+	exp += w - 126 - 46;
+	if (exp > 0)
+		ix = ((int) (m >> (w - 23)) & 0x007fffff) | (exp << 23);
+	else if (exp + 22 >= 0) 
+		ix = (int) (m >> (w - 22 - exp)) & 0x007fffff;
+	else
+		ix = 0;
+
+	ix |= (hx ^ hy) & 0x80000000;
+	return ix;
+}
+
+/* denormalized double * double */
+static void mult64(unsigned long long x, unsigned long long y,
+		unsigned long long *highp, unsigned long long *lowp)
+{
+	unsigned long long sub0, sub1, sub2, sub3;
+	unsigned long long high, low;
+
+	sub0 = (x >> 32) * (unsigned long) (y >> 32);
+	sub1 = (x & 0xffffffffLL) * (unsigned long) (y >> 32);
+	sub2 = (x >> 32) * (unsigned long) (y & 0xffffffffLL);
+	sub3 = (x & 0xffffffffLL) * (unsigned long) (y & 0xffffffffLL);
+	low = sub3;
+	high = 0LL;
+	sub3 += (sub1 << 32);
+	if (low > sub3)
+		high++;
+	low = sub3;
+	sub3 += (sub2 << 32);
+	if (low > sub3)
+		high++;
+	low = sub3;
+	high += (sub1 >> 32) + (sub2 >> 32);
+	high += sub0;
+	*lowp = low;
+	*highp = high;
+}
+
+static inline long long rshift64(unsigned long long mh,
+		unsigned long long ml, int n)
+{
+	if (n >= 64)
+		return mh >> (n - 64);
+	return (mh << (64 - n)) | (ml >> n);
+}
+
+static long long denormal_muld(long long hx, long long hy)
+{
+	unsigned long long ix, iy;
+	unsigned long long mh, ml, nh, nl;
+	int exp, w;
+
+	ix = hx & 0x7fffffffffffffffLL;
+	iy = hy & 0x7fffffffffffffffLL;
+	if (iy < 0x0010000000000000LL || ix == 0)
+		return ((hx ^ hy) & 0x8000000000000000LL);
+
+	exp = (iy & 0x7ff0000000000000LL) >> 52;
+	ix &= 0x000fffffffffffffLL;
+	iy = (iy & 0x000fffffffffffffLL) | 0x0010000000000000LL;
+	mult64(ix, iy, &mh, &ml);
+	nh = mh;
+	nl = ml;
+	w = -1;
+	if (nh) {
+		while (nh) { nh >>= 1; w++;}
+		w += 64;
+	} else
+		while (nl) { nl >>= 1; w++;}
+
+	/* FIXME: use guard bits */	
+	exp += w - 1022 - 52 * 2;
+	if (exp > 0)
+		ix = (rshift64(mh, ml, w - 52) & 0x000fffffffffffffLL)
+			| ((long long)exp << 52);
+	else if (exp + 51 >= 0) 
+		ix = rshift64(mh, ml, w - 51 - exp) & 0x000fffffffffffffLL;
+	else
+		ix = 0;
+
+	ix |= (hx ^ hy) & 0x8000000000000000LL;
+	return ix;
+}
+
+/* ix - iy where iy: denormal and ix, iy >= 0 */
+static int denormal_subf1(unsigned int ix, unsigned int iy)
+{
+	int frac;
+	int exp;
+
+	if (ix < 0x00800000)
+		return ix - iy;
+
+	exp = (ix & 0x7f800000) >> 23;
+	if (exp - 1 > 31)
+		return ix;
+	iy >>= exp - 1;
+	if (iy == 0)
+		return ix;
+
+	frac = (ix & 0x007fffff) | 0x00800000;
+	frac -= iy;
+	while (frac < 0x00800000) {
+		if (--exp == 0)
+			return frac;
+		frac <<= 1;
+	}
+
+	return (exp << 23) | (frac & 0x007fffff);
+}
+
+/* ix + iy where iy: denormal and ix, iy >= 0 */
+static int denormal_addf1(unsigned int ix, unsigned int iy)
+{
+	int frac;
+	int exp;
+
+	if (ix < 0x00800000)
+		return ix + iy;
+
+	exp = (ix & 0x7f800000) >> 23;
+	if (exp - 1 > 31)
+		return ix;
+	iy >>= exp - 1;
+	if (iy == 0)
+	  return ix;
+
+	frac = (ix & 0x007fffff) | 0x00800000;
+	frac += iy;
+	if (frac >= 0x01000000) {
+		frac >>= 1;
+		++exp;
+	}
+
+	return (exp << 23) | (frac & 0x007fffff);
+}
+
+static int denormal_addf(int hx, int hy)
+{
+	unsigned int ix, iy;
+	int sign;
+
+	if ((hx ^ hy) & 0x80000000) {
+		sign = hx & 0x80000000;
+		ix = hx & 0x7fffffff;
+		iy = hy & 0x7fffffff;
+		if (iy < 0x00800000) {
+			ix = denormal_subf1(ix, iy);
+			if (ix < 0) {
+				ix = -ix;
+				sign ^= 0x80000000;
+			}
+		} else {
+			ix = denormal_subf1(iy, ix);
+			sign ^= 0x80000000;
+		}
+	} else {
+		sign = hx & 0x80000000;
+		ix = hx & 0x7fffffff;
+		iy = hy & 0x7fffffff;
+		if (iy < 0x00800000)
+			ix = denormal_addf1(ix, iy);
+		else
+			ix = denormal_addf1(iy, ix);
+	}
+
+	return sign | ix;
+}
+
+/* ix - iy where iy: denormal and ix, iy >= 0 */
+static long long denormal_subd1(unsigned long long ix, unsigned long long iy)
+{
+	long long frac;
+	int exp;
+
+	if (ix < 0x0010000000000000LL)
+		return ix - iy;
+
+	exp = (ix & 0x7ff0000000000000LL) >> 52;
+	if (exp - 1 > 63)
+		return ix;
+	iy >>= exp - 1;
+	if (iy == 0)
+		return ix;
+
+	frac = (ix & 0x000fffffffffffffLL) | 0x0010000000000000LL;
+	frac -= iy;
+	while (frac < 0x0010000000000000LL) {
+		if (--exp == 0)
+			return frac;
+		frac <<= 1;
+	}
+
+	return ((long long)exp << 52) | (frac & 0x000fffffffffffffLL);
+}
+
+/* ix + iy where iy: denormal and ix, iy >= 0 */
+static long long denormal_addd1(unsigned long long ix, unsigned long long iy)
+{
+	long long frac;
+	long long exp;
+
+	if (ix < 0x0010000000000000LL)
+		return ix + iy;
+
+	exp = (ix & 0x7ff0000000000000LL) >> 52;
+	if (exp - 1 > 63)
+		return ix;
+	iy >>= exp - 1;
+	if (iy == 0)
+	  return ix;
+
+	frac = (ix & 0x000fffffffffffffLL) | 0x0010000000000000LL;
+	frac += iy;
+	if (frac >= 0x0020000000000000LL) {
+		frac >>= 1;
+		++exp;
+	}
+
+	return (exp << 52) | (frac & 0x000fffffffffffffLL);
+}
+
+static long long denormal_addd(long long hx, long long hy)
+{
+	unsigned long long ix, iy;
+	long long sign;
+
+	if ((hx ^ hy) & 0x8000000000000000LL) {
+		sign = hx & 0x8000000000000000LL;
+		ix = hx & 0x7fffffffffffffffLL;
+		iy = hy & 0x7fffffffffffffffLL;
+		if (iy < 0x0010000000000000LL) {
+			ix = denormal_subd1(ix, iy);
+			if (ix < 0) {
+				ix = -ix;
+				sign ^= 0x8000000000000000LL;
+			}
+		} else {
+			ix = denormal_subd1(iy, ix);
+			sign ^= 0x8000000000000000LL;
+		}
+	} else {
+		sign = hx & 0x8000000000000000LL;
+		ix = hx & 0x7fffffffffffffffLL;
+		iy = hy & 0x7fffffffffffffffLL;
+		if (iy < 0x0010000000000000LL)
+			ix = denormal_addd1(ix, iy);
+		else
+			ix = denormal_addd1(iy, ix);
+	}
+
+	return sign | ix;
+}
+
 /**
  *	denormal_to_double - Given denormalized float number,
  *	                     store double float
@@ -269,24 +549,103 @@ ieee_fpe_handler (struct pt_regs *regs)
 		finsn = insn;
 	}
 
+#define FPSCR_FPU_ERROR (1 << 17)
+
 	if ((finsn & 0xf1ff) == 0xf0ad) { /* fcnvsd */
 		struct task_struct *tsk = current;
 
-		save_fpu(tsk, regs);
-		if ((tsk->thread.fpu.hard.fpscr & (1 << 17))) {
+		if ((tsk->thread.fpu.hard.fpscr & FPSCR_FPU_ERROR)) {
 			/* FPU error */
 			denormal_to_double (&tsk->thread.fpu.hard,
 					    (finsn >> 8) & 0xf);
-			tsk->thread.fpu.hard.fpscr &=
-				~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK);
-			grab_fpu(regs);
-			restore_fpu(tsk);
-			set_tsk_thread_flag(tsk, TIF_USEDFPU);
-		} else {
-			tsk->thread.trap_no = 11;
-			tsk->thread.error_code = 0;
-			force_sig(SIGFPE, tsk);
-		}
+		} else
+			return 0;
+
+		regs->pc = nextpc;
+		return 1;
+	} else if ((finsn & 0xf00f) == 0xf002) { /* fmul */
+		struct task_struct *tsk = current;
+		int fpscr;
+		int n, m, prec;
+		unsigned int hx, hy;
+
+		n = (finsn >> 8) & 0xf;
+		m = (finsn >> 4) & 0xf;
+		hx = tsk->thread.fpu.hard.fp_regs[n];
+		hy = tsk->thread.fpu.hard.fp_regs[m];
+		fpscr = tsk->thread.fpu.hard.fpscr;
+		prec = fpscr & (1 << 19);
+
+		if ((fpscr & FPSCR_FPU_ERROR)
+		     && (prec && ((hx & 0x7fffffff) < 0x00100000
+				   || (hy & 0x7fffffff) < 0x00100000))) {
+			long long llx, lly;
+
+			/* FPU error because of denormal */
+			llx = ((long long) hx << 32)
+			       | tsk->thread.fpu.hard.fp_regs[n+1];
+			lly = ((long long) hy << 32)
+			       | tsk->thread.fpu.hard.fp_regs[m+1];
+			if ((hx & 0x7fffffff) >= 0x00100000)
+				llx = denormal_muld(lly, llx);
+			else
+				llx = denormal_muld(llx, lly);
+			tsk->thread.fpu.hard.fp_regs[n] = llx >> 32;
+			tsk->thread.fpu.hard.fp_regs[n+1] = llx & 0xffffffff;
+		} else if ((fpscr & FPSCR_FPU_ERROR)
+		     && (!prec && ((hx & 0x7fffffff) < 0x00800000
+				   || (hy & 0x7fffffff) < 0x00800000))) {
+			/* FPU error because of denormal */
+			if ((hx & 0x7fffffff) >= 0x00800000)
+				hx = denormal_mulf(hy, hx);
+			else
+				hx = denormal_mulf(hx, hy);
+			tsk->thread.fpu.hard.fp_regs[n] = hx;
+		} else
+			return 0;
+
+		regs->pc = nextpc;
+		return 1;
+	} else if ((finsn & 0xf00e) == 0xf000) { /* fadd, fsub */
+		struct task_struct *tsk = current;
+		int fpscr;
+		int n, m, prec;
+		unsigned int hx, hy;
+
+		n = (finsn >> 8) & 0xf;
+		m = (finsn >> 4) & 0xf;
+		hx = tsk->thread.fpu.hard.fp_regs[n];
+		hy = tsk->thread.fpu.hard.fp_regs[m];
+		fpscr = tsk->thread.fpu.hard.fpscr;
+		prec = fpscr & (1 << 19);
+
+		if ((fpscr & FPSCR_FPU_ERROR)
+		     && (prec && ((hx & 0x7fffffff) < 0x00100000
+				   || (hy & 0x7fffffff) < 0x00100000))) {
+			long long llx, lly;
+
+			/* FPU error because of denormal */
+			llx = ((long long) hx << 32)
+			       | tsk->thread.fpu.hard.fp_regs[n+1];
+			lly = ((long long) hy << 32)
+			       | tsk->thread.fpu.hard.fp_regs[m+1];
+			if ((finsn & 0xf00f) == 0xf000)
+				llx = denormal_addd(llx, lly);
+			else
+				llx = denormal_addd(llx, lly ^ (1LL << 63));
+			tsk->thread.fpu.hard.fp_regs[n] = llx >> 32;
+			tsk->thread.fpu.hard.fp_regs[n+1] = llx & 0xffffffff;
+		} else if ((fpscr & FPSCR_FPU_ERROR)
+		     && (!prec && ((hx & 0x7fffffff) < 0x00800000
+				   || (hy & 0x7fffffff) < 0x00800000))) {
+			/* FPU error because of denormal */
+			if ((finsn & 0xf00f) == 0xf000)
+				hx = denormal_addf(hx, hy);
+			else
+				hx = denormal_addf(hx, hy ^ 0x80000000);
+			tsk->thread.fpu.hard.fp_regs[n] = hx;
+		} else
+			return 0;
 
 		regs->pc = nextpc;
 		return 1;
@@ -301,11 +660,16 @@ do_fpu_error(unsigned long r4, unsigned 
 {
 	struct task_struct *tsk = current;
 
-	if (ieee_fpe_handler (&regs))
+	save_fpu(tsk, &regs);
+	if (ieee_fpe_handler (&regs)) {
+		tsk->thread.fpu.hard.fpscr &=
+			~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK);
+		grab_fpu(&regs);
+		restore_fpu(tsk);
+		set_tsk_thread_flag(tsk, TIF_USEDFPU);
 		return;
+	}
 
-	regs.pc += 2;
-	save_fpu(tsk, &regs);
 	tsk->thread.trap_no = 11;
 	tsk->thread.error_code = 0;
 	force_sig(SIGFPE, tsk);


