 
                
        
     
                
        
     
                
        
     
                
        
    /* iterate over each character in an ASCII/ISO-8859-* string */
while (buffer[index]) {
  /* single-byte encoding: every byte read here is one whole character */
  char c = buffer[index];
  index += 1;
  blah;
}
/*
 * Decode the next Unicode character from a UTF-8 string.
 *
 * ptr: address of the read cursor; on return *ptr points at the byte
 *      following the sequence that was consumed.
 *
 * Returns the decoded code point.  If *ptr already points at the NUL
 * terminator, 0 is returned and the cursor is left where it is, so the
 * function can never walk off the end of the string.  A malformed sequence
 * (stray continuation byte, invalid lead byte, truncated sequence, overlong
 * encoding, surrogate, or value above U+10FFFF) yields U+FFFD; the cursor is
 * still advanced past the bytes that were examined so callers always make
 * progress.
 */
int utf8_get_next(char **ptr) {
  /* Work on unsigned bytes so the bit tests do not depend on char signedness. */
  unsigned char *s = (unsigned char *)*ptr;
  unsigned char lead = *s;
  int decode;
  int trailing; /* continuation bytes still expected */
  int min;      /* smallest code point valid for this sequence length */

  if (lead == 0) {
    return 0;
  }
  s++;

  if (lead < 0x80) {
    /* Plain ASCII: advance the caller's cursor, not the char ** itself. */
    *ptr = (char *)s;
    return lead;
  }

  if ((lead & 0xE0) == 0xC0) {
    decode = lead & 0x1F;
    trailing = 1;
    min = 0x80;
  } else if ((lead & 0xF0) == 0xE0) {
    decode = lead & 0x0F;
    trailing = 2;
    min = 0x800;
  } else if ((lead & 0xF8) == 0xF0) {
    decode = lead & 0x07;
    trailing = 3;
    min = 0x10000;
  } else {
    /* Stray continuation byte or a lead byte UTF-8 never produces. */
    *ptr = (char *)s;
    return 0xFFFD;
  }

  /*
   * A NUL terminator is not a continuation byte, so a truncated sequence
   * stops the loop before the cursor can move past the end of the string.
   */
  while (trailing > 0 && (*s & 0xC0) == 0x80) {
    decode = (decode << 6) | (*s & 0x3F);
    s++;
    trailing--;
  }
  *ptr = (char *)s;

  if (trailing != 0) {
    return 0xFFFD;
  }
  if (decode < min || decode > 0x10FFFF ||
      (decode >= 0xD800 && decode <= 0xDFFF)) {
    return 0xFFFD;
  }
  return decode;
}
/* walk a UTF-8 string one decoded character at a time */
char *ptr = buffer;
while (*ptr) {
  /* utf8_get_next() moves ptr forward itself */
  int c;
  c = utf8_get_next(&ptr);
  blah;
}
/*
 * Iterate over each character of buffer: decode UTF-8 sequences when is_utf8
 * is set, otherwise treat every byte as one character.
 */
char *ptr = buffer;
while (*ptr != 0) {
  /*
   * Read single bytes through unsigned char so that bytes >= 0x80 come out as
   * 0x80..0xFF instead of negative values on platforms where char is signed.
   */
  int c = is_utf8 ? utf8_get_next(&ptr) : (unsigned char)*(ptr++);
  blah;
}
/*
 * Feed one character to the word wrapper.
 *
 * c: the character to process.
 *
 * - '\n' flushes the pending word and emits a hard newline.
 * - A space flushes the pending word and is printed, unless soft_space is
 *   set, in which case the space is dropped and nothing is flushed.
 * - A break-point character flushes the pending word and is printed.
 * - Anything that reaches the end of the function is appended to the word
 *   buffer.
 */
void process_char(int c) {
  if (c == '\n') {
    flush_buffer();
    print_hard_nl();
    return;
  }

  if (is_space(c)) {
    if (!soft_space) {
      flush_buffer();
      print_char(c);
    }
    return;
  }

  if (is_break_point(c)) {
    flush_buffer();
    print_char(c);
    /*
     * NOTE(review): control falls through, so the break-point character is
     * also appended to the buffer after having been printed -- confirm
     * whether a return (or buffering before the flush) was intended.
     */
  }

  /* TODO: deal with overflow -- nothing here bounds buffer_length yet */
  buffer[buffer_length++] = c;
}
flush_buffer() {
  if (soft_space)
    print_indent();
  soft_space = false;
  if (buffer_lenght + output_pos > width)
    print_soft_nl();
  print(buffer, buffer_length);
  buffer_length = 0;
}
/*
 * Emit a newline produced by wrapping: print '\n', set soft_space, and
 * reset the output column to 0.
 */
void print_soft_nl(void) {
  print_char('\n');
  soft_space = true;
  output_col = 0;
}
print_hard_nl() {
  if (!soft_space)
    print_char('\n')
  soft_space = false;
  output_col = 0;
}
/* rest should be self explanatory */
bigword
anotherword
 
                
        
    
I'm having a little trouble coming up with the easiest way to get this done. My current code buffers up output until a space is found (or the word buffer fills), checks the width of the buffer against the remaining screen space (as reported by NAWS, or a default of 70), and either spits out the word or spits out a newline followed by the current indentation followed by the word.
It works, but it's woefully incomplete. It doesn't deal with embedded colors at all (not a problem for my MUD, as I never ever change colors mid-word, but other MUDs might want the feature). It assumes one byte == one character == one character cell. It only breaks on spaces.
Fixing some of that I can do with a little work. I can have it decode UTF-8 sequences if a Unicode flag is set. I can alter the algorithm to break after hyphens and before punctuation that doesn't end a word. The trick, however, is dealing with true Unicode text properties. Some Unicode code points represent zero-width characters. Some are double-width. The line-break rules of non-Latin alphabets are totally unknown to me.
The best I can find is ICU, but that is a freaking HUGE dependency to pull in. Does anyone know of any alternative, light-weight Unicode-capable libraries (or even just data tables) I can use for this?
Also, is anyone else interested in having a small library for dealing with word wrapping? (Would be usable in a client too.)