Alien-Judy

 view release on metacpan or  search on metacpan

src/judy-1.0.5/tool/jhton.c  view on Meta::CPAN

// Warning:  This cannot be used around compiler directives, such as
// "#include", nor in the case where Code contains a comma other than nested
// within parentheses or quotes.

#ifndef DEBUG
#define	DBGCODE(Code) // null.
#else
#define	DBGCODE(Code) Code
#endif


// ****************************************************************************
// MISCELLANEOUS GLOBAL VALUES:

#define	FUNCTION			// null; easy to find functions.
#define	FALSE	0
#define	TRUE	1
#define	CHNULL	('\0')
#define	PCNULL	((char *) NULL)
typedef int bool_t;			// for clarity with Boolean values.

char * gc_usage[] = {
    "usage: %s filename.htm[l]",
    "",
    "Reads restricted (Judy-specific) HTML from filename.htm[l] and emits",
    "equivalent nroff -man to stdout.",
    PCNULL,
};

char * gc_myname;			// how program was invoked.

#define	OKEXIT	0
#define	NOEXIT	0			// values for Error().
#define	ERREXIT	1
#define	USAGE	2
#define	NOERRNO	0

// Prefix for printf formats:

#define	FILELINE "File \"%s\", line %d: "

// Common error string:

char * FmtErrLineEnds = FILELINE "Input line ends within an HTML tag; for this "
			"translator, all tags must be on a single input line";

// Macros for skipping whitespace or non-whitespace; in the latter case,
// stopping at end of line or end of tag:

#define	SKIPSPACE(Pch)	   { while (ISSPACE(*(Pch))) ++(Pch); }

#define	SKIPNONSPACE(Pch)  { while ((! ISSPACE(*(Pch))) \
				 && (*(Pch) != CHNULL)  \
				 && (*(Pch) != '>')) ++(Pch); }

// Highest line number + 1, and last input line number that caused output:

int g_linenumlim;
int g_prevlinenum = 0;

// <PRE> block equivalents in nroff need some special handling for bold font
// and for continuing a tagged paragraph; these are bit flags:

#define	INPRE_BLOCK  0x1	// came from <PRE>.
#define	INPRE_BOLD   0x2	// came from <B><PRE>.
#define	INPRE_INDENT 0x4	// under <DL> below top level.


// ****************************************************************************
// DOCUMENT NODE TYPES:
//
// If an HTML tag is not in this list, it's unrecognized and causes a fatal
// error.  Otherwise the tag type (dn_type) is one of DN_TYPE_*, which are
// defined so the code can use them, but they MUST match the order of
// initialization of g_dntype[].
//
// Note:  The default node type is DN_TYPE_TEXT, that is, text outside of any
// tag.

enum {
	DN_TYPE_TEXT = 0,
	DN_TYPE_HTML,
	DN_TYPE_HEAD,
	DN_TYPE_TITLE,
	DN_TYPE_BODY,
	DN_TYPE_COMM,
	DN_TYPE_TABLE,
	DN_TYPE_TR,
	DN_TYPE_TD,
	DN_TYPE_DL,
	DN_TYPE_DT,
	DN_TYPE_DD,
	DN_TYPE_A,
	DN_TYPE_B,
	DN_TYPE_I,
	DN_TYPE_PRE,
	DN_TYPE_P,
	DN_TYPE_BR
};

// Regarding dnt_nest:  If an HTML tag type is marked as nesting, that means
// it is required not to be a singleton in this context; it must have a closing
// tag, and when the tree is built, the intervening text is nested as a child.
// Otherwise, intervening text is a sibling; a closing tag is allowed (whether
// or not this makes sense), but is not required; however, if present, it must
// match.

struct docnode_type {
	char *	dnt_tag;		// HTML tag.
	bool_t	dnt_savetag;		// flag: save HTML tag.
	bool_t	dnt_nest;		// flag: see comments above.
	int	dnt_type;		// corresponding number.
} g_dntype[] = {

// Note:  HTML is case-insensitive, but for expediency this program is
// case-sensitive.  Tags must be as shown below.

    { "",	FALSE, FALSE, DN_TYPE_TEXT,  },	// special, see above.

    { "HTML",	FALSE, TRUE,  DN_TYPE_HTML,  },
    { "HEAD",	FALSE, TRUE,  DN_TYPE_HEAD,  },

src/judy-1.0.5/tool/jhton.c  view on Meta::CPAN

	    {
		PUTS("\n.SH ");

		if ((Pdn->dn_Pchild->dn_type) == DN_TYPE_B)
		    (Pdn->dn_Pchild->dn_noemit) = TRUE;	 // skip <B>...</B>.
	    }

// If a <DT> immediately follows a previous <DT>, use .PD 0 for the successive
// .TP to join lines:

	    else
	    {
		if (((Pdn->dn_Pprev) != PDNNULL)
		 && ((Pdn->dn_Pprev->dn_type) == DN_TYPE_DT))
		{
		    PUTS("\n.PD 0\n");
		    suffix = "\n.PD\n";
		}

		PUTS("\n.TP 15\n.C ");
	    }

	    SETPREVNONL;
	    break;


// DESCRIPTIVE LIST DATUM:
//
// Just proceed to dump the embedded text.

	case DN_TYPE_DD: break;


// ANCHOR:
//
// Ignore inbound ("name") anchors and process outbound ("href") anchor labels
// into appropriately highlighted text.

	case DN_TYPE_A:
	{
	    size_t len;		// of substring.
	    Pdn_t  Pdn2;	// child node.
	    char * Pch;		// place in text.

	    assert((Pdn->dn_text) != PCNULL);

	    if (strstr(Pdn->dn_text, "name=") != PCNULL) break;

	    if (strstr(Pdn->dn_text, "href=") == PCNULL)
	    {
		Error(NOEXIT, NOERRNO, "Unrecognized HTML anchor type \"%s\" "
		      "at input line %d ignored; only \"name=\" and \"href=\" "
		      "are allowed by this translator",
		      Pdn->dn_text, Pdn->dn_linenum);
		break;
	    }

// Check for nested text (anchor label):
//
// TBD:  The error message lies a little.  If the text is something like,
// "foo<B>bar</B>", it passes this test; and later, all font tags in the anchor
// label are marked no-emit; and any other embedded tags, who knows what
// happens?

	    if (((Pdn2 = Pdn->dn_Pchild)->dn_type) != DN_TYPE_TEXT)
	    {
		Error(ERREXIT, NOERRNO, "HTML \"href\" anchor at input line "
		      "%d lacks a directly nested anchor label, with no "
		      "further nested tags; this translator cannot support "
		      "nested tags in anchor labels", Pdn->dn_linenum);
	    }
	    assert((Pdn2->dn_text) != PCNULL);

// If the anchor is within a <B><PRE>, do nothing special with fonts, as
// explained earlier:

	    if (ParentPre(Pdn, /* BoldOnly = */ TRUE)) break;

// Since anchor label text font will be forced in a moment, ignore any nested
// font directives so they don't mess up nroff:

	    MarkNoEmit(Pdn->dn_Pchild, /* Font = */ TRUE);

// See if anchor label appears to be a reference to the current page, to some
// other page, or else just make it italicized text:
//
// TBD:  This is pretty shaky, hope it's close enough.

	    len = strlen(PageName);

	    if (strncmp(Pdn2->dn_text, PageName, len) == 0)  // self-reference.
	    {
		CHECKPREV;
		PUTS("\\fB");			// bold font.
		SETPREVNONL;
		suffix = "\\fP";		// revert to previous font.
		break;
	    }

// Contains '(' and no whitespace => appears to reference some other page:
//
// Emit revised, tagged anchor label text immediately.

	    if (((Pch = strchr(Pdn2->dn_text, '(')) != PCNULL)
	     && NoWhiteSpace(Pdn2->dn_text))
	    {
		CHECKPREV;
		PUTS("\\fI");			// italic font.
		*Pch = CHNULL;			// terminate briefly.
		PUTS(Pdn2->dn_text);
		*Pch = '(';
		PUTS("\\fP");			// revert to previous font.
		PUTS(Pch);
		SETPREV(Pdn2->dn_text);

		(Pdn2->dn_noemit) = TRUE;	// skip later.
		break;
	    }

// Just make the anchor label italicized text:

	    CHECKPREV;
	    PUTS("\\fI");			// italic font.
	    SETPREVNONL;
	    suffix = "\\fP";			// revert to previous font.
	    break;

	} // case DN_TYPE_A


// BOLD TEXT:
//
// If the first child is <PRE>, use a "hard" font change; otherwise an in-line
// change.
//
// Note:  For <DT><B>, this node is already marked dn_noemit and not seen here.
//
// Note:  For <B><PRE>, nroff seems to reset font upon .PP, so mark the bold
// for later emission.

	case DN_TYPE_B:

	    if (((Pdn->dn_Pchild) != PDNNULL)
	     && ((Pdn->dn_Pchild->dn_type) == DN_TYPE_PRE))
	    {
		(Pdn->dn_Pchild->dn_bold) = TRUE;	// see above.
		break;
	    }

	    CHECKPREV;
	    PUTS("\\fB");			// bold font.
	    SETPREVNONL;
	    suffix = "\\fP";			// revert to previous font.
	    break;


// ITALIC TEXT:

	case DN_TYPE_I:

	    CHECKPREV;
	    PUTS("\\fI");			// italic font.
	    SETPREVNONL;
	    suffix = "\\fP";			// revert to previous font.
	    break;


// PREFORMATTED TEXT:
//
// Emit prefix/suffix directives based on example in strchr(3C).

	case DN_TYPE_PRE:

	    PUTS(UNDER_DL ? "\n.IP\n.nf\n.ps +1\n" : "\n.PP\n.nf\n.ps +1\n");
	    if (Pdn->dn_bold) PUTS(".ft B\n");	// deferred bold.
	    SETPREVNONL;
	    suffix = ((Pdn->dn_bold) ? "\n.ft P\n.ps\n.fi\n" : "\n.ps\n.fi\n");

	    // set for all children:
	    InPRE = INPRE_BLOCK
		  | ((Pdn->dn_bold) ? INPRE_BOLD   : 0)
		  | (UNDER_DL ?	      INPRE_INDENT : 0);
	    break;


// PARAGRAPH BREAK:
//
// If the parent is a <DL> below the top level, use .IP to continue a .TP
// (tagged paragraph); otherwise emit a standard .PP.

	case DN_TYPE_P:

	    PUTS(UNDER_DL ? "\n.IP\n" : "\n.PP\n");
	    SETPREVNONL;
	    break;


// LINE BREAK:

	case DN_TYPE_BR: PUTS("\n.br\n"); SETPREVNONL; break;


// UNRECOGNIZED DOCNODE TYPE:

	default:
	    Error(ERREXIT, NOERRNO, "Internal error: Unexpected docnode type "
		  "%d in docnodes tree", Pdn->dn_type);

	} // end switch on dn_type


// VISIT CHILD AND SIBLING DOCNODES:
//
// If this was a <DL> here, pass an incremented value to child nodes, but not
// to sibling nodes.

NextNode:
	if ((Pdn->dn_Pchild) != PDNNULL)
	    EmitNroffBody(Pdn->dn_Pchild, DLLevel + DLcount, InPRE, PageName);

	if (suffix != PCNULL) PUTS(suffix);

	if ((Pdn->dn_Pnext) != PDNNULL)
	    EmitNroffBody(Pdn->dn_Pnext, DLLevel, InPRE, PageName);

src/judy-1.0.5/tool/jhton.c  view on Meta::CPAN


	assert( Pch != PCNULL);
	assert(*Pch != CHNULL);

// Save type:

	(Pdn->dn_type) = DN_Type;

// Pass whitespace and then find the end of the tag:

	SKIPSPACE(Pch);

	if ((*Pch == CHNULL) || ((Pch2 = strchr(Pch, '>')) == PCNULL))
	    Error(ERREXIT, NOERRNO, FmtErrLineEnds, Filename, Linenum);

// Optionally save tag text:

	if (SAVETAG(DN_Type))
	{
	    *Pch2 = CHNULL;			// temporarily terminate.
	    (Pdn->dn_text) = StrSave(Pch);
	    *Pch2 = '>';
	}

	return(Pch2 + 1);

} // SaveDocNode()


// ****************************************************************************
// P A R E N T   P R E
//
// Given a docnode (can be null) and a flag whether only bold <PRE> is of
// interest, return TRUE if any of its parents is a <PRE> (marked for bold
// text), that is, DN_TYPE_PRE (with dn_bold set); otherwise return FALSE.

FUNCTION bool_t ParentPre(
	Pdn_t  Pdn,		// starting node.
	bool_t BoldOnly)	// flag: only care about bold <PRE>.
{
	if (Pdn == PDNNULL) return (FALSE);	// no parent.

	for (Pdn = Pdn->dn_Pparent; Pdn != PDNNULL; Pdn = Pdn->dn_Pparent)
	{
	    if (((Pdn->dn_type) == DN_TYPE_PRE)
	     && ((! BoldOnly) || (Pdn->dn_bold)))
	    {
		return(TRUE);
	    }
	}

	return(FALSE);

} // ParentPre()


// ****************************************************************************
// M A R K   N O   E M I T
//
// Given a docnode (can be null), and a flag, recursively mark the node and all
// children and siblings as do-not-emit, unless the flag is set, only mark font
// docnodes.

FUNCTION void MarkNoEmit(
	Pdn_t  Pdn,	// top node to mark.
	bool_t Font)	// flag: only mark font docnodes.
{
	if (Pdn == PDNNULL) return;

	if ((! Font)
	 || ((Pdn->dn_type) == DN_TYPE_B)
	 || ((Pdn->dn_type) == DN_TYPE_I))
	{
	    (Pdn->dn_noemit) = TRUE;
	}

	if ((Pdn->dn_Pchild) != PDNNULL) MarkNoEmit(Pdn->dn_Pchild, Font);
	if ((Pdn->dn_Pnext)  != PDNNULL) MarkNoEmit(Pdn->dn_Pnext,  Font);

} // MarkNoEmit()


// ****************************************************************************
// E M I T   T E X T
//
// Given a text string, a bitflag for <PRE> status, and an input line number
// for error reporting, copy the text string to stdout with no added newlines,
// but translating selected HTML escape codes to simple characters, doubling
// any backslashes, and if InPRE, inserting .IP (if INPRE_INDENT) or .PP at
// blank lines (between successive newlines), and if INPRE_BOLD, putting back
// bold font since .IP/.PP seems to reset the font.  Warn about unrecognized
// escape codes.

struct et_list {
	char * et_escape;	// expected text.
	size_t et_len;		// of expected text.
	char   et_emit;		// equivalent char.
} et_list[] = {
	{ "amp;", 4, '&', },
	{ "gt;",  3, '>', },
	{ "lt;",  3, '<', },
	{ PCNULL, 0, ' ', },	// end of list.
};

FUNCTION void EmitText(
	char * Pch,		// text to emit.
	int    InPRE,		// bitflag for <PRE> status.
	int    Linenum)		// for error reporting.
{
	char * Pch2;		// place in text.
	struct et_list * Pet;	// place in et_list[].

	while ((Pch2 = strchr(Pch, '&')) != PCNULL)	// another escape code.
	{
	    *Pch2 = CHNULL;		// briefly terminate.
	    EmitTextPRE(Pch, InPRE);	// emit preceding part.
	    *Pch2 = '&';
	    Pch = Pch2 + 1;			// past '&'.

	    for (Pet = et_list; Pet->et_escape != PCNULL; ++Pet)
	    {
		if (strncmp(Pch, Pet->et_escape, Pet->et_len) == 0)
		{
		    PUTC(Pet->et_emit);		// translate.
		    Pch += Pet->et_len;		// skip escapecode.
		    break;
		}
	    }

	    if (Pet->et_escape == PCNULL)	// no match found.
	    {
		Error(NOEXIT, NOERRNO, "Unrecognized HTML escape code in "
		      "line %d (or text beginning on that line): \"%.4s...\", "
		      "passed through unaltered", Linenum, Pch2);

		PUTC('&');		// emit start of escape code.
		// continue with Pch is just after the '&'.
	    }
	}

	EmitTextPRE(Pch, InPRE);	// emit remaining part.

} // EmitText()


// ****************************************************************************
// E M I T   T E X T   P R E
//
// Given a text string with no HTML escape codes in it and a bitflag for <PRE>
// status (see EmitText()), emit the string with <PRE> handling, and with any
// backslashes doubled.

FUNCTION void EmitTextPRE(
	char * Pch,		// string to emit.
	int    InPRE)		// bitflag for <PRE> status.
{
	char * Pch2;		// place in string.

	if (! InPRE) { EmitTextBS(Pch); return; }

	while ((Pch2 = strchr(Pch, '\n')) != PCNULL)	// another newline.
	{
	    *Pch2 = CHNULL;		// briefly terminate.
	    EmitTextBS(Pch);		// emit preceding part.
	    *Pch2 = '\n';
	    PUTC('\n');			// emit current newline.

	    if (*(Pch = Pch2 + 1) == '\n')	// successive newline.
	    {
		// emit before next newline:
		PUTS((InPRE & INPRE_INDENT) ? ".IP" : ".PP");

		// also reset font:
		if (InPRE & INPRE_BOLD) PUTS("\n.ft B");
	    }
	}

	EmitTextBS(Pch);		// emit trailing part.

} // EmitTextPRE()


// ****************************************************************************
// E M I T   T E X T   B S
//
// Given a text string with no HTML escape codes in it, emit the string with
// any backslashes doubled.

FUNCTION void EmitTextBS(
	char * Pch)		// string to emit.
{
	while (*Pch != CHNULL)
	{
	    PUTC(*Pch); if (*Pch == '\\') PUTC('\\');
	    ++Pch;
	}

} // EmitTextBS()


// ****************************************************************************
// N O   W H I T E   S P A C E
//
// Given a string, return TRUE if it contains no whitespace, otherwise FALSE.

FUNCTION bool_t NoWhiteSpace(
	char * Pch)	// string to check.
{
	assert(Pch != PCNULL);

	while (*Pch != CHNULL) { if (ISSPACE(*Pch)) return(FALSE); ++Pch; }
	return(TRUE);

} // NoWhiteSpace()


// ****************************************************************************
// C O U N T   N E W L I N E S
//
// Return the number of newline chars in a string.

FUNCTION int CountNewlines(
	char * Pch)	// in which to count newlines.
{
	int    count = 0;

	assert(Pch != PCNULL);

	while (*Pch != CHNULL) count += ((*Pch++) == '\n');
	return(count);

} // CountNewlines()



( run in 1.190 second using v1.01-cache-2.11-cpan-5735350b133 )