Decoder code structure is mostly in-place.
diff --git a/core/upb_stream.h b/core/upb_stream.h
index cf01a5f..54fd930 100644
--- a/core/upb_stream.h
+++ b/core/upb_stream.h
@@ -171,14 +171,18 @@
 /* upb_bytesrc ****************************************************************/
 
 // Reads up to "count" bytes into "buf", returning the total number of bytes
-// read.  If <0, indicates error (check upb_bytesrc_status for details).
+// read.  If 0, indicates error and puts details in "status".
 INLINE upb_strlen_t upb_bytesrc_read(upb_bytesrc *src, void *buf,
-                                     upb_strlen_t count);
+                                     upb_strlen_t count, upb_status *status);
 
 // Like upb_bytesrc_read(), but modifies "str" in-place, possibly aliasing
-// existing string data (which avoids a copy).
+// existing string data (which avoids a copy).  On the other hand, if
+// the data was *not* already in an existing string, this copies it into
+// a upb_string, and if the data needs to be put in a specific range of
+// memory (because eg. you need to put it into a different kind of string
+// object) then upb_bytesrc_get() could be better.
 INLINE bool upb_bytesrc_getstr(upb_bytesrc *src, upb_string *str,
-                               upb_strlen_t count);
+                               upb_status *status);
 
 // A convenience function for getting all the remaining data in a upb_bytesrc
 // as a upb_string.  Returns false and sets "status" if the operation fails.
@@ -189,14 +193,6 @@
   return upb_bytesrc_getfullstr(upb_value_getbytesrc(val), str, status);
 }
 
-// Returns the current error status for the stream.
-// Note!  The "eof" flag works like feof() in C; it cannot report end-of-file
-// until a read has failed due to eof.  It cannot preemptively tell you that
-// the next call will fail due to eof.  Since these are the semantics that C
-// and UNIX provide, we're stuck with them if we want to support eg. stdio.
-INLINE upb_status *upb_bytesrc_status(upb_bytesrc *src);
-INLINE bool upb_bytesrc_eof(upb_bytesrc *src);
-
 
 /* upb_bytesink ***************************************************************/
 
diff --git a/core/upb_string.h b/core/upb_string.h
index 1f4b20c..04c0ae9 100644
--- a/core/upb_string.h
+++ b/core/upb_string.h
@@ -3,26 +3,39 @@
  *
  * Copyright (c) 2010 Joshua Haberman.  See LICENSE for details.
  *
- * This file defines a simple string type.  The overriding goal of upb_string
- * is to avoid memcpy(), malloc(), and free() wheverever possible, while
- * keeping both CPU and memory overhead low.  Throughout upb there are
- * situations where one wants to reference all or part of another string
- * without copying.  upb_string provides APIs for doing this.
+ * This file defines a simple string type which is length-delimited instead
+ * of NULL-terminated, and which has useful sharing semantics.
+ *
+ * The overriding goal of upb_string is to avoid memcpy(), malloc(), and free()
+ * wheverever possible, while keeping both CPU and memory overhead low.
+ * Throughout upb there are situations where one wants to reference all or part
+ * of another string without copying.  upb_string provides APIs for doing this.
  *
  * Characteristics of upb_string:
  * - strings are reference-counted.
- * - strings are logically immutable.
+ * - strings are immutable (can be mutated only when first created or recycled).
  * - if a string has no other referents, it can be "recycled" into a new string
  *   without having to reallocate the upb_string.
  * - strings can be substrings of other strings (owning a ref on the source
  *   string).
- * - strings are not thread-safe by default, but can be made so by calling a
- *   function.  This is not the default because it causes extra CPU overhead.
  *
  * Reference-counted strings have recently fallen out of favor because of the
  * performance impacts of doing thread-safe reference counting with atomic
  * operations.  We side-step this issue by not performing atomic operations
  * unless the string has been marked thread-safe.
+ *
+ * Strings are expected to be 8-bit-clean, but "char*" is such an entrenched
+ * idiom that we go with it instead of making our pointers uint8_t*.
+ *
+ * WARNING: THE GETREF, UNREF, AND RECYCLE OPERATIONS ARE NOT THREAD_SAFE
+ * UNLESS THE STRING HAS BEEN MARKED SYNCHRONIZED!  What this means is that if
+ * you are logically passing a reference to a upb_string to another thread
+ * (which implies that the other thread must eventually call unref of recycle),
+ * you have two options:
+ *
+ * - create a copy of the string that will be used in the other thread only.
+ * - call upb_string_get_synchronized_ref(), which will make getref, unref, and
+ *   recycle thread-safe for this upb_string.
  */
 
 #ifndef UPB_STRING_H
@@ -83,10 +96,12 @@
 // longer needed, it should be unref'd, never freed directly.
 upb_string *upb_string_new();
 
+// Internal-only; clients should call upb_string_unref().
 void _upb_string_free(upb_string *str);
 
 // Releases a ref on the given string, which may free the memory.  "str"
-// can be NULL, in which case this is a no-op.
+// can be NULL, in which case this is a no-op.  WARNING: NOT THREAD_SAFE
+// UNLESS THE STRING IS SYNCHRONIZED.
 INLINE void upb_string_unref(upb_string *str) {
   if (str && upb_atomic_read(&str->refcount) > 0 &&
       upb_atomic_unref(&str->refcount)) {
@@ -98,6 +113,7 @@
 
 // Returns a string with the same contents as "str".  The caller owns a ref on
 // the returned string, which may or may not be the same object as "str.
+// WARNING: NOT THREAD-SAFE UNLESS THE STRING IS SYNCHRONIZED!
 INLINE upb_string *upb_string_getref(upb_string *str) {
   int refcount = upb_atomic_read(&str->refcount);
   if (refcount == _UPB_STRING_REFCOUNT_STACK) return upb_strdup(str);
@@ -163,8 +179,11 @@
 // data.  Waiting for a clear use case before actually implementing it.
 //
 // Makes the string "str" a reference to the given string data.  The caller
-// guarantees that the given string data will not change or be deleted until
-// a matching call to upb_string_detach().
+// guarantees that the given string data will not change or be deleted until a
+// matching call to upb_string_detach(), which may block until any concurrent
+// readers have finished reading.  upb_string_detach() preserves the contents
+// of the string by copying the referenced data if there are any other
+// referents.
 // void upb_string_attach(upb_string *str, char *ptr, upb_strlen_t len);
 // void upb_string_detach(upb_string *str);
 
@@ -207,6 +226,22 @@
     _UPB_STRING_INIT(str, sizeof(str)-1, _UPB_STRING_REFCOUNT_STACK)
 #define UPB_STACK_STRING_LEN(str, len) \
     _UPB_STRING_INIT(str, len, _UPB_STRING_REFCOUNT_STACK)
+
+// A convenient way of specifying upb_strings as literals, like:
+//
+//   upb_streql(UPB_STRLIT("expected"), other_str);
+//
+// However, this requires either C99 compound initializers or C++.
+// Must ONLY be called with a string literal as its argument!
+//#ifdef __cplusplus
+//namespace upb {
+//class String : public upb_string {
+//  // This constructor must ONLY be called with a string literal.
+//  String(const char *str) : upb_string(UPB_STATIC_STRING(str)) {}
+//};
+//}
+//#define UPB_STRLIT(str) upb::String(str)
+//#endif
 #define UPB_STRLIT(str) &(upb_string)UPB_STATIC_STRING(str)
 
 /* upb_string library functions ***********************************************/
diff --git a/stream/upb_decoder.c b/stream/upb_decoder.c
index b820b08..fbd7eba 100644
--- a/stream/upb_decoder.c
+++ b/stream/upb_decoder.c
@@ -11,127 +11,39 @@
 #include <stdlib.h>
 #include "upb_def.h"
 
-/* Functions to read wire values. *********************************************/
+/* Pure Decoding **************************************************************/
 
-// These functions are internal to the decode, but might be moved into an
-// internal header file if we at some point in the future opt to do code
-// generation, because the generated code would want to inline these functions.
-// The same applies to the functions to read .proto values below.
+// The key fast-path varint-decoding routine.  There are a lot of possibilities
+// for optimization/experimentation here.
+INLINE bool upb_decode_varint_fast(uint8_t **buf, uint8_t *end, uint64_t &val,
+                                   upb_status *status) {
+  *high = 0;
+  uint32_t b;
+  uint8_t *ptr = p->ptr;
+  b = *(*buf++); *low   = (b & 0x7f)      ; if(!(b & 0x80)) goto done;
+  b = *(*buf++); *low  |= (b & 0x7f) <<  7; if(!(b & 0x80)) goto done;
+  b = *(*buf++); *low  |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done;
+  b = *(*buf++); *low  |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done;
+  b = *(*buf++); *low  |= (b & 0x7f) << 28;
+                 *high  = (b & 0x7f) >>  3; if(!(b & 0x80)) goto done;
+  b = *(*buf++); *high |= (b & 0x7f) <<  4; if(!(b & 0x80)) goto done;
+  b = *(*buf++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done;
+  b = *(*buf++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done;
+  b = *(*buf++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done;
 
-const uint8_t *upb_get_v_uint64_t_full(const uint8_t *buf, const uint8_t *end,
-                                       uint64_t *val, upb_status *status);
-
-// Gets a varint (wire type: UPB_WIRE_TYPE_VARINT).
-INLINE const uint8_t *upb_get_v_uint64_t(const uint8_t *buf, const uint8_t *end,
-                                         uint64_t *val, upb_status *status)
-{
-  // We inline this common case (1-byte varints), if that fails we dispatch to
-  // the full (non-inlined) version.
-  if((*buf & 0x80) == 0) {
-    *val = *buf & 0x7f;
-    return buf + 1;
-  } else {
-    return upb_get_v_uint64_t_full(buf, end, val, status);
-  }
+  upb_seterr(status, UPB_ERROR, "Unterminated varint");
+  return false;
+done:
+  return true;
 }
 
-// Gets a varint -- called when we only need 32 bits of it.  Note that a 32-bit
-// varint is not a true wire type.
-INLINE const uint8_t *upb_get_v_uint32_t(const uint8_t *buf, const uint8_t *end,
-                                         uint32_t *val, upb_status *status)
-{
-  uint64_t val64;
-  const uint8_t *ret = upb_get_v_uint64_t(buf, end, &val64, status);
-  *val = (uint32_t)val64;  // Discard the high bits.
-  return ret;
-}
 
-// Gets a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT).
-INLINE const uint8_t *upb_get_f_uint32_t(const uint8_t *buf, const uint8_t *end,
-                                         uint32_t *val, upb_status *status)
-{
-  const uint8_t *uint32_end = buf + sizeof(uint32_t);
-  if(uint32_end > end) {
-    status->code = UPB_STATUS_NEED_MORE_DATA;
-    return end;
-  }
-  memcpy(val, buf, sizeof(uint32_t));
-  return uint32_end;
-}
-
-// Gets a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT).
-INLINE const uint8_t *upb_get_f_uint64_t(const uint8_t *buf, const uint8_t *end,
-                                         uint64_t *val, upb_status *status)
-{
-  const uint8_t *uint64_end = buf + sizeof(uint64_t);
-  if(uint64_end > end) {
-    status->code = UPB_STATUS_NEED_MORE_DATA;
-    return end;
-  }
-  memcpy(val, buf, sizeof(uint64_t));
-  return uint64_end;
-}
-
-INLINE const uint8_t *upb_skip_v_uint64_t(const uint8_t *buf,
-                                          const uint8_t *end,
-                                          upb_status *status)
-{
-  const uint8_t *const maxend = buf + 10;
-  uint8_t last = 0x80;
-  for(; buf < (uint8_t*)end && (last & 0x80); buf++)
-    last = *buf;
-
-  if(buf >= end && buf <= maxend && (last & 0x80)) {
-    status->code = UPB_STATUS_NEED_MORE_DATA;
-    buf = end;
-  } else if(buf > maxend) {
-    status->code = UPB_ERROR_UNTERMINATED_VARINT;
-    buf = end;
-  }
-  return buf;
-}
-
-INLINE const uint8_t *upb_skip_f_uint32_t(const uint8_t *buf,
-                                          const uint8_t *end,
-                                          upb_status *status)
-{
-  const uint8_t *uint32_end = buf + sizeof(uint32_t);
-  if(uint32_end > end) {
-    status->code = UPB_STATUS_NEED_MORE_DATA;
-    return end;
-  }
-  return uint32_end;
-}
-
-INLINE const uint8_t *upb_skip_f_uint64_t(const uint8_t *buf,
-                                          const uint8_t *end,
-                                          upb_status *status)
-{
-  const uint8_t *uint64_end = buf + sizeof(uint64_t);
-  if(uint64_end > end) {
-    status->code = UPB_STATUS_NEED_MORE_DATA;
-    return end;
-  }
-  return uint64_end;
-}
-
-/* Functions to read .proto values. *******************************************/
+/* Decoding/Buffering of individual values ************************************/
 
 // Performs zig-zag decoding, which is used by sint32 and sint64.
 INLINE int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); }
 INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); }
 
-// Parses a tag, places the result in *tag.
-INLINE const uint8_t *decode_tag(const uint8_t *buf, const uint8_t *end,
-                                 upb_tag *tag, upb_status *status)
-{
-  uint32_t tag_int;
-  const uint8_t *ret = upb_get_v_uint32_t(buf, end, &tag_int, status);
-  tag->wire_type    = (upb_wire_type_t)(tag_int & 0x07);
-  tag->field_number = tag_int >> 3;
-  return ret;
-}
-
 // The decoder keeps a stack with one entry per level of recursion.
 // upb_decoder_frame is one frame of that stack.
 typedef struct {
@@ -144,6 +56,7 @@
   // Immutable state of the decoder.
   upb_src src;
   upb_dispatcher dispatcher;
+  upb_bytesrc *bytesrc;
   upb_msgdef *toplevel_msgdef;
   upb_decoder_frame stack[UPB_MAX_NESTING];
 
@@ -158,66 +71,108 @@
   // Current input buffer.
   upb_string *buf;
 
-  // The offset within the overall stream represented by the *beginning* of buf.
-  upb_strlen_t buf_stream_offset;
-
-  // Our current offset *within* buf.  Will be negative if we are buffering
-  // from previous buffers in tmpbuf.
+  // Our current offset *within* buf.
   upb_strlen_t buf_offset;
 
-  // Holds any bytes we have from previous buffers.  The number of bytes we
-  // have encoded here is -buf_offset, if buf_offset<0, 0 otherwise.
-  uint8_t tmpbuf[UPB_MAX_ENCODED_SIZE];
+  // The offset within the overall stream represented by the *beginning* of buf.
+  upb_strlen_t buf_stream_offset;
 };
 
-upb_flow_t upb_decode_varint(upb_decoder *d, ptrs *p,
-                             uint32_t *low, uint32_t *high) {
-  if (p->end - p->ptr > UPB_MAX_ENCODED_SIZE) {
-    // Fast path; we know we have a complete varint in our existing buffer.
-    *high = 0;
-    uint32_t b;
-    uint8_t *ptr = p->ptr;
-    b = *(buf++); *low   = (b & 0x7f)      ; if(!(b & 0x80)) goto done;
-    b = *(buf++); *low  |= (b & 0x7f) <<  7; if(!(b & 0x80)) goto done;
-    b = *(buf++); *low  |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done;
-    b = *(buf++); *low  |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done;
-    b = *(buf++); *low  |= (b & 0x7f) << 28;
-                  *high  = (b & 0x7f) >>  3; if(!(b & 0x80)) goto done;
-    b = *(buf++); *high |= (b & 0x7f) <<  4; if(!(b & 0x80)) goto done;
-    b = *(buf++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done;
-    b = *(buf++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done;
-    b = *(buf++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done;
-
-    if(bytes_available >= 10) {
-      upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Varint was unterminated "
-                 "after 10 bytes, stream offset: %u", upb_decoder_offset(d));
-      return false;
-    }
-
-  done:
-    p->ptr = ptr;
-  } else {
-    // Slow path: we may have to combine one or more buffers to get a whole
-    // varint worth of data.
-    uint8_t buf[UPB_MAX_ENCODED_SIZE];
-    uint8_t *p = buf, *end = buf + sizeof(buf);
-    for(ing bitpos = 0; p < end && getbyte(d, p) && (last & 0x80); p++, bitpos += 7)
-      *val |= ((uint64_t)((last = *p) & 0x7F)) << bitpos;
-
-    if(d->status->code == UPB_EOF  && (last & 0x80)) {
-      upb_seterr(status, UPB_ERROR,
-                 "Provided data ended in the middle of a varint.\n");
-    } else if(buf == maxend) {
-      upb_seterr(status, UPB_ERROR,
-                 "Varint was unterminated after 10 bytes.\n");
-    } else {
-      // Success.
-      return;
-    }
-    ungetbytes(d, buf, p - buf);
+// Called only from the slow path, this function copies the next "len" bytes
+// from the stream to "data", adjusting "buf" and "end" appropriately.
+INLINE bool upb_getbuf(upb_decoder *d, void *data, size_t len,
+                       uint8_t **buf, uint8_t **end) {
+  while (len > 0) {
+    memcpy(data, *buf, *end-*buf);
+    len -= (*end-*buf);
+    if (!upb_bytesrc_getstr(d->bytesrc, d->buf, d->status)) return false;
+    *buf = upb_string_getrobuf(d->buf);
+    *end = *buf + upb_string_len(d->buf);
   }
 }
 
+// We use this path when we don't have UPB_MAX_ENCODED_SIZE contiguous bytes
+// available in our current buffer.  We don't inline this because we accept
+// that it will be slow and we don't want to pay for two copies of it.
+static bool upb_decode_varint_slow(upb_decoder *d) {
+  uint8_t buf[UPB_MAX_ENCODED_SIZE];
+  uint8_t *p = buf, *end = buf + sizeof(buf);
+  for(int bitpos = 0; p < end && getbyte(d, p) && (last & 0x80); p++, bitpos += 7)
+    *val |= ((uint64_t)((last = *p) & 0x7F)) << bitpos;
+
+  if(d->status->code == UPB_EOF  && (last & 0x80)) {
+    upb_seterr(status, UPB_ERROR,
+               "Provided data ended in the middle of a varint.\n");
+  } else if(buf == maxend) {
+    upb_seterr(status, UPB_ERROR,
+               "Varint was unterminated after 10 bytes.\n");
+  } else {
+    // Success.
+    return;
+  }
+}
+
+INLINE bool upb_decode_tag(upb_decoder *d, const uint8_t **_buf,
+                           const uint8_t **end, upb_tag *tag) {
+  const uint8_t *buf = *_buf, *end = *_end;
+  uint32_t tag_int;
+  // Nearly all tag varints will be either 1 byte (1-16) or 2 bytes (17-2048).
+  if (end - buf < 2) goto slow;  // unlikely.
+  tag_int = *buf & 0x7f;
+  if ((*(buf++) & 0x80) == 0) goto done;  // predictable if fields are in order
+  tag_int |= (*buf & 0x7f) << 7;
+  if ((*(buf++) & 0x80) != 0) goto slow;  // unlikely.
+slow:
+  if (!upb_decode_varint_slow(d, _buf, _end)) return false;
+  buf = *_buf;  // Trick the next line into not overwriting us.
+done:
+  *_buf = buf;
+  tag->wire_type = (upb_wire_type_t)(tag_int & 0x07);
+  tag->field_number = tag_int >> 3;
+  return true;
+}
+
+INLINE bool upb_decode_varint(upb_decoder *d, ptrs *p,
+                              uint32_t *low, uint32_t *high) {
+  if (p->end - p->ptr >= UPB_MAX_VARINT_ENCODED_SIZE)
+    return upb_decode_varint_fast(d);
+  else
+    return upb_decode_varint_slow(d);
+}
+
+INLINE bool upb_decode_fixed(upb_decoder *d, upb_wire_type_t wt,
+                             uint8_t **buf, uint8_t **end, upb_value *val) {
+  static const char table = {0, 8, 0, 0, 0, 4};
+  size_t bytes = table[wt];
+  if (*end - *buf >= bytes) {
+    // Common (fast) case.
+    memcpy(&val, *buf, bytes);
+    *buf += bytes;
+  } else {
+    if (!upb_getbuf(d, &val, bytes, buf, end)) return false;
+  }
+  return true;
+}
+
+// "val" initially holds the length of the string, this is replaced by the
+// contents of the string.
+INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, upb_string **str) {
+  upb_string_recycle(str);
+  upb_strlen_t len = upb_valu_getint32(*val);
+  if (*end - *buf >= len) {
+    // Common (fast) case.
+    upb_string_substr(*str, d->buf, *buf - upb_string_getrobuf(d->buf), len);
+    *buf += len;
+  } else {
+    if (!upb_getbuf(d, upb_string_getrwbuf(*str, len), len, buf, end))
+      return false;
+  }
+  return true;
+}
+
+
+/* The main decoding loop *****************************************************/
+
 static const void *get_msgend(upb_decoder *d)
 {
   if(d->top->end_offset > 0)
@@ -238,36 +193,29 @@
   return upb_types[ft].expected_wire_type == wt;
 }
 
-
-// Pushes a new stack frame for a submessage with the given len (which will
-// be zero if the submessage is a group).
-static const uint8_t *push(upb_decoder *d, const uint8_t *start,
+static bool upb_push(upb_decoder *d, const uint8_t *start,
                            uint32_t submsg_len, upb_fielddef *f,
                            upb_status *status)
 {
   d->top->field = f;
   d->top++;
   if(d->top >= d->limit) {
-    upb_seterr(status, UPB_ERROR_MAX_NESTING_EXCEEDED,
-               "Nesting exceeded maximum (%d levels)\n",
-               UPB_MAX_NESTING);
-    return NULL;
+    upb_seterr(status, UPB_ERROR, "Nesting too deep.");
+    return false;
   }
-  upb_decoder_frame *frame = d->top;
-  frame->end_offset = d->completed_offset + submsg_len;
-  frame->msgdef = upb_downcast_msgdef(f->def);
-
-  upb_dispatch_startsubmsg(&d->dispatcher, f);
-  return get_msgend(d);
+  d->top->end_offset = d->completed_offset + submsg_len;
+  d->top->msgdef = upb_downcast_msgdef(f->def);
+  *submsg_end = get_msgend(d);
+  if (!upb_dispatch_startsubmsg(&d->dispatcher, f)) return false;
+  return true;
 }
 
-// Pops a stack frame, returning a pointer for where the next submsg should
-// end (or a pointer that is out of range for a group).
-static const void *pop(upb_decoder *d, const uint8_t *start, upb_status *status)
+static bool upb_pop(upb_decoder *d, const uint8_t *start, upb_status *status)
 {
   d->top--;
   upb_dispatch_endsubmsg(&d->dispatcher);
-  return get_msgend(d);
+  *submsg_end = get_msgend(d);
+  return true;
 }
 
 void upb_decoder_run(upb_src *src, upb_status *status) {
@@ -278,11 +226,13 @@
   upb_msgdef *msgdef = d->top->msgdef;
   upb_string *str = NULL;
 
+  upb_dispatch_startmsg(&d->dispatcher);
+
   // Main loop: executed once per tag/field pair.
   while(1) {
     // Parse/handle tag.
     upb_tag tag;
-    CHECK(decode_tag(d, &buf, &end, &tag));
+    CHECK(upb_decode_tag(d, &buf, &end, &tag));
 
     // Decode wire data.  Hopefully this branch will predict pretty well
     // since most types will read a varint here.
@@ -290,24 +240,19 @@
     switch (tag.wire_type) {
       case UPB_WIRE_TYPE_END_GROUP:
         if(!isgroup(submsg_end)) {
-          upb_seterr(status, UPB_STATUS_ERROR, "End group seen but current "
-                     "message is not a group, byte offset: %zd",
-                     d->completed_offset + (completed - start));
+          upb_seterr(status, UPB_ERROR, "Unexpected END_GROUP tag.");
           goto err;
         }
-        submsg_end = pop(d, start, status, &msgdef);
-        completed = buf;
-        goto check_msgend;
+        CHECK(upb_pop(d, start, status, &msgdef, &submsg_end));
+        goto check_msgend;  // We have no value to dispatch.
       case UPB_WIRE_TYPE_VARINT:
       case UPB_WIRE_TYPE_DELIMITED:
         // For the delimited case we are parsing the length.
         CHECK(upb_decode_varint(d, &buf, &end, &val));
         break;
       case UPB_WIRE_TYPE_32BIT:
-        CHECK(upb_decode_32bit(d, &buf, &end, &val));
-        break;
       case UPB_WIRE_TYPE_64BIT:
-        CHECK(upb_decode_64bit(d, &buf, &end, &val));
+        CHECK(upb_decode_fixed(d, tag.wire_type, &buf, &end, &val));
         break;
     }
 
@@ -315,24 +260,31 @@
     upb_fielddef *f = upb_msg_itof(msgdef, tag.field_number);
 
     if (!f) {
-      // Unknown field.
+      if (tag.wire_type == UPB_WIRE_TYPE_DELIMITED)
+        CHECK(upb_decode_string(d, &val, &str));
+      CHECK(upb_dispatch_unknownval(d, tag.field_number, val));
     } else if (!upb_check_type(tag.wire_type, f->type)) {
-      // Field has incorrect type.
+      // TODO: put more details in this error msg.
+      upb_seterr(status, UPB_ERROR, "Field had incorrect type.");
+      goto err;
     }
 
     // Perform any further massaging of the data now that we have the fielddef.
     // Now we can distinguish strings from submessages, and we know about
     // zig-zag-encoded types.
     // TODO: handle packed encoding.
+    // TODO: if we were being paranoid, we could check for 32-bit-varint types
+    // that the top 32 bits all match the highest bit of the low 32 bits.
+    // If this is not true we are losing data.  But the main protobuf library
+    // doesn't check this, and it would slow us down, so pass for now.
     switch (f->type) {
       case UPB_TYPE(MESSAGE):
       case UPB_TYPE(GROUP):
-        CHECK(push(d, start, upb_value_getint32(val), f, status, &msgdef));
-        goto check_msgend;
+        CHECK(upb_push(d, start, upb_value_getint32(val), f, status, &msgdef));
+        goto check_msgend;  // We have no value to dispatch.
       case UPB_TYPE(STRING):
       case UPB_TYPE(BYTES):
-        CHECK(upb_decode_string(d, str, upb_value_getint32(val)));
-        upb_value_setstr(&val, str);
+        CHECK(upb_decode_string(d, &val, &str));
         break;
       case UPB_TYPE(SINT32):
         upb_value_setint32(&val, upb_zzdec_32(upb_value_getint32(val)));
@@ -341,26 +293,27 @@
         upb_value_setint64(&val, upb_zzdec_64(upb_value_getint64(val)));
         break;
       default:
-        // Other types need no further processing at this point.
+        break;  // Other types need no further processing at this point.
     }
     CHECK(upb_dispatch_value(d->sink, f, val, status));
 
 check_msgend:
     while(buf >= submsg_end) {
       if(buf > submsg_end) {
-        upb_seterr(status, UPB_ERROR, "Expected submsg end offset "
-                   "did not lie on a tag/value boundary.");
+        upb_seterr(status, UPB_ERROR, "Bad submessage end.")
         goto err;
       }
-      submsg_end = pop(d, start, status, &msgdef);
+      CHECK(upb_pop(d, start, status, &msgdef, &submsg_end));
     }
-    completed = buf;
   }
 
+  CHECK(upb_dispatch_endmsg(&d->dispatcher));
+  return;
+
 err:
-  read = (char*)completed - (char*)start;
-  d->completed_offset += read;
-  return read;
+  if (upb_ok(status)) {
+    upb_seterr(status, UPB_ERROR, "Callback returned UPB_BREAK");
+  }
 }
 
 void upb_decoder_sethandlers(upb_src *src, upb_handlers *handlers) {