Zap Weird Characters from Text Documents!

Or: Convert Extended ASCII Windows Text to Pure ASCII Text for Unix or Linux

Saving a file as "plain ASCII" from MS Word, Adobe Acrobat, or other applications often creates a document that is full of strange non-ASCII characters.  This is because while ASCII is a 7-bit character code, such programs use one of the many 8-bit extended ASCII alphabets.

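The following little C program (which reads from stdin and writes to stdout) converts everything to standard ASCII and also performs a couple of other chores: it throws out control characters, wraps long lines, and gets rid of DOS carriage return characters.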

Okay, here is the program.


/* Convert extended-ascii characters to reasonable substitutes,
throw out control characters, wrap long lines, and get rid of
DOS carriage return characters */

#include <stdio.h>

char trans[] = " "
    "E ,f\" * ^%S<  Z "
    " `'\"\".--~ s>  zY"
    "  cL Y|S\"Ca ~  -"
    "o 23'uP.,1o     "
    "SSSSSS CEEEEIIII"
    "DNOOOOOx0UUUUYPB"
    "aaaaaa ceeeeiiii"
    "onooooo/0uuuuypy";

int main(){
    int c; int i;
    i = 0;

    while ((c = getchar())!=-1){

        if (c < ' ' && c!='\t' && c!='\n' && c!='\v' && c!='\f')
            { ; /* throw it out */ }
        else {
            /* special multi-char sequences */
            if (c == 0x85) { printf("..."); }
            else if (c == 0x8C) { printf("OE"); i+= 2; }
            else if (c == 0x88) { printf("**"); i+= 2; }
            else if (c == 0x99) { printf("tm"); i+= 2; }
            else if (c == 0x9C) { printf("oe"); i+= 2; }
            else if (c == 0xA9) { printf("(C)"); i+= 3; }
            else if (c == 0xAB) { printf("<<"); i+= 2; }
            else if (c == 0xAE) { printf("(R)"); i+= 3; }
            else if (c == 0xB1) { printf("+-"); i+= 2; }
            else if (c == 0xBB) { printf(">>"); i+= 2; }
            else if (c == 0xBC) { printf("1/4"); i+= 3; }
            else if (c == 0xBD) { printf("1/2"); i+= 3; }
            else if (c == 0xBE) { printf("3/4"); i+= 3; }
            else if (c == 0xC6) { printf("AE"); i+= 2; }
            else if (c == 0xE6) { printf("ae"); i+= 2; }
            else {
                if (c == '\v' || c == '\f') c = '\n';
                else if (c > 0x7E) { c = trans[c - 0x7F]; }
                /* line wrap */
                if ((i > 70) && c == ' ') c = '\n';
                putchar(c);
                i ++;
            }
            if (c == '\n') i = 0;
        }
    }
return 0;
}