diff --git a/simd/jccolext-altivec.c b/simd/jccolext-altivec.c
index 5678f53c..e3a97b36 100644
--- a/simd/jccolext-altivec.c
+++ b/simd/jccolext-altivec.c
@@ -28,13 +28,14 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
                                     JSAMPIMAGE output_buf,
                                     JDIMENSION output_row, int num_rows)
 {
-  JSAMPROW inptr;
-  JSAMPROW outptr0, outptr1, outptr2;
-  int pitch;
-  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0}, rgbg0,
-    rgbg1, rgbg2, rgbg3, y, cb, cr;
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int pitch = img_width * RGB_PIXELSIZE, offset, num_cols;
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0},
+    rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
 #if RGB_PIXELSIZE == 4
-  __vector unsigned char rgb4;
+  __vector unsigned char rgb4 = {0};
 #endif
   __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
   __vector unsigned short y01, y23, cr01, cr23, cb01, cb23;
@@ -59,39 +60,112 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
     outptr2 = output_buf[2][output_row];
     output_row++;
 
-    for (pitch = img_width * RGB_PIXELSIZE; pitch > 0;
-         pitch -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
          outptr0 += 16, outptr1 += 16, outptr2 += 16) {
 
-#if RGB_PIXELSIZE == 3
-      /* Load 16 pixels == 48 bytes */
-      if ((size_t)inptr & 15) {
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
         __vector unsigned char unaligned_shift_index;
-        rgb0 = vec_ld(0, inptr);
-        if (pitch > 16)
+        int bytes = num_cols + offset;
+
+        if (bytes >= (RGB_PIXELSIZE + 1) * 16) {
+          /* Fast path -- we have enough buffer space to load all vectors.
+           * Even if we don't need them all, this is faster than narrowing
+           * down which ones we need.
+           */
+          rgb0 = vec_ld(0, inptr);
           rgb1 = vec_ld(16, inptr);
-        else
-          rgb1 = vec_ld(-1, inptr + pitch);
-        if (pitch > 32)
           rgb2 = vec_ld(32, inptr);
-        else
-          rgb2 = vec_ld(-1, inptr + pitch);
-        if (pitch > 48)
           rgb3 = vec_ld(48, inptr);
-        else
-          rgb3 = vec_ld(-1, inptr + pitch);
+#if RGB_PIXELSIZE == 4
+          rgb4 = vec_ld(64, inptr);
+#endif
+        } else {
+          if (bytes & 15) {
+            /* Slow path to prevent buffer overread. Since there is no way to
+             * read a partial AltiVec register, overread would occur on the
+             * last chunk of the last image row if the right edge is not on a
+             * 16-byte boundary. It could also occur on other rows if the
+             * bytes per row is low enough. Since we can't determine whether
+             * we're on the last image row, we have to assume every row is the
+             * last.
+             */
+            memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+            rgb0 = vec_ld(0, tmpbuf);
+            rgb1 = vec_ld(16, tmpbuf);
+            rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+            rgb3 = vec_ld(48, tmpbuf);
+#endif
+            goto start;  /* Skip permutation */
+          } else {
+            /* Medium path -- if the right edge is vector-aligned, then we can
+             * read full vectors (but with a lot of branches.)
+             */
+            rgb0 = vec_ld(0, inptr);
+            if (bytes > 16) {
+              rgb1 = vec_ld(16, inptr);
+              if (bytes > 32) {
+                rgb2 = vec_ld(32, inptr);
+                if (bytes > 48) {
+                  rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+                  if (bytes > 64)
+                    rgb4 = vec_ld(64, inptr);
+#endif
+                }
+              }
+            }
+          }
+        }
+
         unaligned_shift_index = vec_lvsl(0, inptr);
         rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
         rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
         rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+        rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
       } else {
-        rgb0 = vec_ld(0, inptr);
-        if (pitch > 16)
+        if (num_cols >= RGB_PIXELSIZE * 16) {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
           rgb1 = vec_ld(16, inptr);
-        if (pitch > 32)
           rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, inptr);
+#endif
+        } else {
+          if (num_cols & 15) {
+            /* Slow path */
+            memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+            rgb0 = vec_ld(0, tmpbuf);
+            rgb1 = vec_ld(16, tmpbuf);
+            rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+            rgb3 = vec_ld(48, tmpbuf);
+#endif
+          } else {
+            /* Medium path */
+            rgb0 = vec_ld(0, inptr);
+            if (num_cols > 16) {
+              rgb1 = vec_ld(16, inptr);
+              if (num_cols > 32) {
+                rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+                if (num_cols > 48)
+                  rgb3 = vec_ld(48, inptr);
+#endif
+              }
+            }
+          }
+        }
       }
 
+start:
+#if RGB_PIXELSIZE == 3
       /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
        * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
        * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
@@ -106,41 +180,6 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
       rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
       rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
 #else
-      /* Load 16 pixels == 64 bytes */
-      if ((size_t)inptr & 15) {
-        __vector unsigned char unaligned_shift_index;
-        rgb0 = vec_ld(0, inptr);
-        if (pitch > 16)
-          rgb1 = vec_ld(16, inptr);
-        else
-          rgb1 = vec_ld(-1, inptr + pitch);
-        if (pitch > 32)
-          rgb2 = vec_ld(32, inptr);
-        else
-          rgb2 = vec_ld(-1, inptr + pitch);
-        if (pitch > 48)
-          rgb3 = vec_ld(48, inptr);
-        else
-          rgb3 = vec_ld(-1, inptr + pitch);
-        if (pitch > 64)
-          rgb4 = vec_ld(64, inptr);
-        else
-          rgb4 = vec_ld(-1, inptr + pitch);
-        unaligned_shift_index = vec_lvsl(0, inptr);
-        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
-        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
-        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
-        rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
-      } else {
-        rgb0 = vec_ld(0, inptr);
-        if (pitch > 16)
-          rgb1 = vec_ld(16, inptr);
-        if (pitch > 32)
-          rgb2 = vec_ld(32, inptr);
-        if (pitch > 48)
-          rgb3 = vec_ld(48, inptr);
-      }
-
       /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
        * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
        * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
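The three-way dispatch above is the core of the change, so here is a minimal scalar model of it -- an illustration only, not part of the patch. It covers just the aligned case (offset == 0); the unaligned case is the same with offset folded into the byte count. walk_row_aligned() and the printf() strings are made up; the real code issues vec_ld() where this model prints.

#include <stdio.h>
#include <string.h>

#define RGB_PIXELSIZE 4  /* assumption: a 4-byte extended-RGB format */

#ifndef min
#define min(a,b) ((a) < (b) ? (a) : (b))
#endif

/* Hypothetical walk over one image row, mirroring the patch's path
 * selection for 16-byte-aligned input.  Each iteration consumes 16
 * pixels, as in the real loop. */
static void walk_row_aligned(int pitch)
{
  int num_cols;

  for (num_cols = pitch; num_cols > 0; num_cols -= RGB_PIXELSIZE * 16) {
    if (num_cols >= RGB_PIXELSIZE * 16)
      /* Fast path: a full 16-pixel chunk remains, so every vector load
       * stays inside the row. */
      printf("fast:   load %d bytes directly\n", RGB_PIXELSIZE * 16);
    else if (num_cols & 15)
      /* Slow path: the remainder is not a whole number of 16-byte
       * vectors, so loading the last vector would read past the end of
       * the row.  Copy the valid bytes into the aligned tmpbuf first. */
      printf("slow:   memcpy %d bytes to tmpbuf, then load\n",
             min(num_cols, RGB_PIXELSIZE * 16));
    else
      /* Medium path: the remainder ends exactly on a 16-byte boundary,
       * so whole vectors can still be loaded, just fewer of them. */
      printf("medium: load %d bytes as whole vectors\n", num_cols);
  }
}

int main(void)
{
  walk_row_aligned(20 * RGB_PIXELSIZE);  /* 80 bytes: fast, then medium */
  walk_row_aligned(17 * RGB_PIXELSIZE);  /* 68 bytes: fast, then slow */
  return 0;
}

Note also that the slow path loads from tmpbuf, which is 16-byte-aligned by construction, which is why the patch can goto start and skip the vec_perm() realignment step entirely.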
diff --git a/simd/jccolor-altivec.c b/simd/jccolor-altivec.c
index 04b8708d..acb0357a 100644
--- a/simd/jccolor-altivec.c
+++ b/simd/jccolor-altivec.c
@@ -25,6 +25,11 @@
 
 #include "jsimd_altivec.h"
 
+#ifndef min
+#define min(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
+
 #define F_0_081 5329 /* FIX(0.08131) */
 #define F_0_114 7471 /* FIX(0.11400) */
 #define F_0_168 11059 /* FIX(0.16874) */
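A side note on the min() macro added above (the same definition is repeated in jcgray-altivec.c below): the #ifndef guard avoids colliding with a min that a platform header may already define. Also, being a function-like macro, it can evaluate an argument twice. That is harmless in this patch, where it is only ever called as min(num_cols, RGB_PIXELSIZE * 16) with side-effect-free arguments, but the classic pitfall looks like this:

#include <stdio.h>

#ifndef min
#define min(a,b) ((a) < (b) ? (a) : (b))
#endif

int main(void)
{
  int i = 0;
  /* Expands to ((i++) < (10) ? (i++) : (10)): i++ runs once in the
   * comparison and again in the selected branch. */
  int r = min(i++, 10);

  printf("r = %d, i = %d\n", r, i);  /* prints r = 1, i = 2 -- not r = 0 */
  return 0;
}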
diff --git a/simd/jcgray-altivec.c b/simd/jcgray-altivec.c
index b52fade0..7fc0e478 100644
--- a/simd/jcgray-altivec.c
+++ b/simd/jcgray-altivec.c
@@ -25,6 +25,11 @@
 
 #include "jsimd_altivec.h"
 
+#ifndef min
+#define min(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
+
 #define F_0_114 7471 /* FIX(0.11400) */
 #define F_0_250 16384 /* FIX(0.25000) */
 #define F_0_299 19595 /* FIX(0.29900) */
diff --git a/simd/jcgryext-altivec.c b/simd/jcgryext-altivec.c
index c1d0a453..93377444 100644
--- a/simd/jcgryext-altivec.c
+++ b/simd/jcgryext-altivec.c
@@ -30,11 +30,13 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
                                      JDIMENSION output_row, int num_rows)
 {
   JSAMPROW inptr, outptr;
-  int pitch;
-  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0}, rgbg0,
-    rgbg1, rgbg2, rgbg3, y;
+  int pitch = img_width * RGB_PIXELSIZE, offset, num_cols;
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0},
+    rgbg0, rgbg1, rgbg2, rgbg3, y;
 #if RGB_PIXELSIZE == 4
-  __vector unsigned char rgb4;
+  __vector unsigned char rgb4 = {0};
 #endif
   __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
   __vector unsigned short y01, y23;
@@ -53,39 +55,112 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
     outptr = output_buf[0][output_row];
     output_row++;
 
-    for (pitch = img_width * RGB_PIXELSIZE; pitch > 0;
-         pitch -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
          outptr += 16) {
 
-#if RGB_PIXELSIZE == 3
-      /* Load 16 pixels == 48 bytes */
-      if ((size_t)inptr & 15) {
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
         __vector unsigned char unaligned_shift_index;
-        rgb0 = vec_ld(0, inptr);
-        if (pitch > 16)
+        int bytes = num_cols + offset;
+
+        if (bytes >= (RGB_PIXELSIZE + 1) * 16) {
+          /* Fast path -- we have enough buffer space to load all vectors.
+           * Even if we don't need them all, this is faster than narrowing
+           * down which ones we need.
+           */
+          rgb0 = vec_ld(0, inptr);
           rgb1 = vec_ld(16, inptr);
-        else
-          rgb1 = vec_ld(-1, inptr + pitch);
-        if (pitch > 32)
           rgb2 = vec_ld(32, inptr);
-        else
-          rgb2 = vec_ld(-1, inptr + pitch);
-        if (pitch > 48)
           rgb3 = vec_ld(48, inptr);
-        else
-          rgb3 = vec_ld(-1, inptr + pitch);
+#if RGB_PIXELSIZE == 4
+          rgb4 = vec_ld(64, inptr);
+#endif
+        } else {
+          if (bytes & 15) {
+            /* Slow path to prevent buffer overread. Since there is no way to
+             * read a partial AltiVec register, overread would occur on the
+             * last chunk of the last image row if the right edge is not on a
+             * 16-byte boundary. It could also occur on other rows if the
+             * bytes per row is low enough. Since we can't determine whether
+             * we're on the last image row, we have to assume every row is the
+             * last.
+             */
+            memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+            rgb0 = vec_ld(0, tmpbuf);
+            rgb1 = vec_ld(16, tmpbuf);
+            rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+            rgb3 = vec_ld(48, tmpbuf);
+#endif
+            goto start;  /* Skip permutation */
+          } else {
+            /* Medium path -- if the right edge is vector-aligned, then we can
+             * read full vectors (but with a lot of branches.)
+             */
+            rgb0 = vec_ld(0, inptr);
+            if (bytes > 16) {
+              rgb1 = vec_ld(16, inptr);
+              if (bytes > 32) {
+                rgb2 = vec_ld(32, inptr);
+                if (bytes > 48) {
+                  rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+                  if (bytes > 64)
+                    rgb4 = vec_ld(64, inptr);
+#endif
+                }
+              }
+            }
+          }
+        }
+
         unaligned_shift_index = vec_lvsl(0, inptr);
         rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
         rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
         rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+        rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
       } else {
-        rgb0 = vec_ld(0, inptr);
-        if (pitch > 16)
+        if (num_cols >= RGB_PIXELSIZE * 16) {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
           rgb1 = vec_ld(16, inptr);
-        if (pitch > 32)
           rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, inptr);
+#endif
+        } else {
+          if (num_cols & 15) {
+            /* Slow path */
+            memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+            rgb0 = vec_ld(0, tmpbuf);
+            rgb1 = vec_ld(16, tmpbuf);
+            rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+            rgb3 = vec_ld(48, tmpbuf);
+#endif
+          } else {
+            /* Medium path */
+            rgb0 = vec_ld(0, inptr);
+            if (num_cols > 16) {
+              rgb1 = vec_ld(16, inptr);
+              if (num_cols > 32) {
+                rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+                if (num_cols > 48)
+                  rgb3 = vec_ld(48, inptr);
+#endif
+              }
+            }
+          }
+        }
       }
 
+start:
+#if RGB_PIXELSIZE == 3
       /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
        * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
        * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
@@ -100,41 +175,6 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
       rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
       rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
 #else
-      /* Load 16 pixels == 64 bytes */
-      if ((size_t)inptr & 15) {
-        __vector unsigned char unaligned_shift_index;
-        rgb0 = vec_ld(0, inptr);
-        if (pitch > 16)
-          rgb1 = vec_ld(16, inptr);
-        else
-          rgb1 = vec_ld(-1, inptr + pitch);
-        if (pitch > 32)
-          rgb2 = vec_ld(32, inptr);
-        else
-          rgb2 = vec_ld(-1, inptr + pitch);
-        if (pitch > 48)
-          rgb3 = vec_ld(48, inptr);
-        else
-          rgb3 = vec_ld(-1, inptr + pitch);
-        if (pitch > 64)
-          rgb4 = vec_ld(64, inptr);
-        else
-          rgb4 = vec_ld(-1, inptr + pitch);
-        unaligned_shift_index = vec_lvsl(0, inptr);
-        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
-        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
-        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
-        rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
-      } else {
-        rgb0 = vec_ld(0, inptr);
-        if (pitch > 16)
-          rgb1 = vec_ld(16, inptr);
-        if (pitch > 32)
-          rgb2 = vec_ld(32, inptr);
-        if (pitch > 48)
-          rgb3 = vec_ld(48, inptr);
-      }
-
       /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
        * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
        * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
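For reference, the vec_lvsl()/vec_perm() pairs used on unaligned input in both converters are the standard AltiVec unaligned-load idiom: vec_ld() can only load from a 16-byte-aligned address, so an unaligned vector has to be assembled from the two aligned vectors that straddle it. A minimal sketch of the idiom, assuming a PowerPC target built with -maltivec (load_unaligned() is our name, not part of the patch):

#include <altivec.h>

/* Assemble 16 bytes starting at an arbitrarily aligned pointer.  vec_ld()
 * silently rounds the effective address down to a 16-byte boundary, so we
 * load the two aligned vectors covering [p, p + 16) and shift the wanted
 * bytes into place with the permute control vector from vec_lvsl().
 *
 * Note that the second vec_ld() reads past p + 15 -- this is exactly the
 * overread at the right edge of a row that the patch above guards against
 * with its medium and slow paths. */
static inline __vector unsigned char load_unaligned(const unsigned char *p)
{
  __vector unsigned char lo = vec_ld(0, p);
  __vector unsigned char hi = vec_ld(16, p);

  return vec_perm(lo, hi, vec_lvsl(0, p));
}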