antiword/findtext.c

/*
 * findtext.c
 * Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
 *
 * Description:
 * Find the blocks that contain the text of MS Word files
 */

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include "antiword.h"


/*
 * bAddTextBlocks - Add the blocks to the text block list
 *
 * Follows the Big Block Depot chain that starts at ulStartBlock and adds
 * one entry to the text block list for every big block that holds a part
 * of the requested text.
 *
 * ulCharPosFirst	byte offset of the text from the start of the chain;
 *			also stored as ulCharPos of the first text block
 * ulTotalLength	length of the text in characters
 * bUsesUnicode		TRUE: two bytes per character, FALSE: one byte
 * usPropMod		copied unchanged into every text block
 * ulStartBlock		first block of the chain, or END_OF_CHAIN
 * aulBBD, tBBDLen	the Big Block Depot and its number of entries
 *
 * Returns TRUE when all the bytes of the text were found in the chain and
 * added to the list, FALSE if not
 */
BOOL
bAddTextBlocks(ULONG ulCharPosFirst, ULONG ulTotalLength,
	BOOL bUsesUnicode, USHORT usPropMod,
	ULONG ulStartBlock, const ULONG *aulBBD, size_t tBBDLen)
{
	text_block_type	tTextBlock;
	ULONG	ulCharPos, ulOffset, ulIndex;
	long	lToGo;

	fail(ulTotalLength > (ULONG)LONG_MAX / 2);
	fail(ulStartBlock > MAX_BLOCKNUMBER && ulStartBlock != END_OF_CHAIN);
	fail(aulBBD == NULL);

	/*
	 * The length comes from the file and may be garbage. Reject a value
	 * that does not fit in lToGo after doubling instead of relying on
	 * fail() alone, so that the conversion and the multiplication below
	 * can never overflow a signed long.
	 * NOTE(review): fail() is presumably a debug-only assertion - confirm
	 */
	if (ulTotalLength > (ULONG)LONG_MAX / 2) {
		DBG_DEC(ulTotalLength);
		return FALSE;
	}

	NO_DBG_HEX(ulCharPosFirst);
	NO_DBG_DEC(ulTotalLength);

	/* lToGo counts the number of bytes that still have to be located */
	if (bUsesUnicode) {
		/* One character equals two bytes */
		NO_DBG_MSG("Uses Unicode");
		lToGo = (long)ulTotalLength * 2;
	} else {
		/* One character equals one byte */
		NO_DBG_MSG("Uses ASCII");
		lToGo = (long)ulTotalLength;
	}

	ulCharPos = ulCharPosFirst;
	ulOffset = ulCharPosFirst;
	for (ulIndex = ulStartBlock;
	     ulIndex != END_OF_CHAIN && lToGo > 0;
	     ulIndex = aulBBD[ulIndex]) {
		/* Checked before aulBBD[ulIndex] is read by the loop step */
		if (ulIndex >= (ULONG)tBBDLen) {
			DBG_DEC(ulIndex);
			DBG_DEC(tBBDLen);
			/* NOTE(review): assumes werr(1, ...) does not return */
			werr(1, "The Big Block Depot is damaged");
		}
		if (ulOffset >= BIG_BLOCK_SIZE) {
			/* The text starts in a later block, skip this one */
			ulOffset -= BIG_BLOCK_SIZE;
			continue;
		}
		tTextBlock.ulFileOffset =
			(ulIndex + 1) * BIG_BLOCK_SIZE + ulOffset;
		tTextBlock.ulCharPos = ulCharPos;
		/* Never more than the rest of this block or the rest to go */
		tTextBlock.ulLength = min(BIG_BLOCK_SIZE - ulOffset,
						(ULONG)lToGo);
		tTextBlock.bUsesUnicode = bUsesUnicode;
		tTextBlock.usPropMod = usPropMod;
		/* All following blocks are used from their first byte */
		ulOffset = 0;
		if (!bAdd2TextBlockList(&tTextBlock)) {
			DBG_HEX(tTextBlock.ulFileOffset);
			DBG_HEX(tTextBlock.ulCharPos);
			DBG_DEC(tTextBlock.ulLength);
			DBG_DEC(tTextBlock.bUsesUnicode);
			DBG_DEC(tTextBlock.usPropMod);
			return FALSE;
		}
		ulCharPos += tTextBlock.ulLength;
		lToGo -= (long)tTextBlock.ulLength;
	}
	/* A chain that ended too early leaves lToGo above zero */
	DBG_DEC_C(lToGo != 0, lToGo);
	return lToGo == 0;
} /* end of bAddTextBlocks */

/*
 * bGet6DocumentText - make a list of the text blocks of Word 6/7 files
 *
 * Code for "fast saved" files.
 *
 * Reads the text information (fcClx/lcbClx from the header) through the
 * Big Block Depot chain that starts at ulStartBlock and walks the records
 * in it:
 *   type 0: skipped (one extra byte)
 *   type 1: handed to vAdd2PropModList, then skipped by its length word
 *   type 2: the piece table; every piece is passed to bAddTextBlocks and
 *           parsing stops after this record
 * All offsets and lengths come from the file, so they are checked against
 * the size of the buffer before they are used.
 *
 * Returns TRUE when successful, FALSE if not
 */
BOOL
bGet6DocumentText(FILE *pFile, BOOL bUsesUnicode, ULONG ulStartBlock,
	const ULONG *aulBBD, size_t tBBDLen, const UCHAR *aucHeader)
{
	UCHAR	*aucBuffer;
	ULONG	ulBeginTextInfo, ulTextOffset, ulTotLength;
	size_t	tTextInfoLen, tLeft;
	int	iIndex, iType, iOff, iLen, iPieces;
	USHORT	usPropMod;

	DBG_MSG("bGet6DocumentText");

	fail(pFile == NULL);
	fail(aulBBD == NULL);
	fail(aucHeader == NULL);

	ulBeginTextInfo = ulGetLong(0x160, aucHeader);	/* fcClx */
	DBG_HEX(ulBeginTextInfo);
	tTextInfoLen = (size_t)ulGetLong(0x164, aucHeader);	/* lcbClx */
	DBG_DEC(tTextInfoLen);

	aucBuffer = xmalloc(tTextInfoLen);
	if (!bReadBuffer(pFile, ulStartBlock,
			aulBBD, tBBDLen, BIG_BLOCK_SIZE,
			aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
		aucBuffer = xfree(aucBuffer);
		return FALSE;
	}
	NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);

	iOff = 0;
	while ((size_t)iOff < tTextInfoLen) {
		iType = (int)ucGetByte(iOff, aucBuffer);
		iOff++;
		if (iType == 0) {
			DBG_FIXME();
			iOff++;
			continue;
		}
		/* The number of bytes in the buffer after the type byte */
		tLeft = tTextInfoLen - (size_t)iOff;
		if (iType == 1) {
			/* A length word followed by iLen bytes of data */
			if (tLeft < 2) {
				DBG_DEC(tLeft);
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
			iLen = (int)usGetWord(iOff, aucBuffer);
			if ((size_t)iLen > tLeft - 2) {
				/* The record runs past the end of the buffer */
				DBG_DEC(iLen);
				DBG_DEC(tLeft);
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
			vAdd2PropModList(aucBuffer + iOff);
			iOff += iLen + 2;
			continue;
		}
		if (iType != 2) {
			werr(0, "Unknown type of 'fastsaved' format");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		/* Type 2 */
		if (tLeft < 4) {
			/* Not even room for the four byte length field */
			DBG_DEC(tLeft);
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		/*
		 * NOTE(review): only the lower 16 bits of the four byte
		 * length field are used here, the Word 8 variant reads all
		 * 32 bits - confirm that this is intended
		 */
		iLen = (int)usGetWord(iOff, aucBuffer);
		NO_DBG_DEC(iLen);
		iOff += 4;
		tLeft -= 4;
		iPieces = (iLen - 4) / 12;
		DBG_DEC(iPieces);
		/*
		 * The loop below reads (iPieces + 1) offsets of four bytes
		 * followed by iPieces descriptors of eight bytes, that is
		 * 12 * iPieces + 4 bytes starting at iOff.
		 */
		if (iPieces > 0 &&
		    (tLeft < 4 || (size_t)iPieces > (tLeft - 4) / 12)) {
			DBG_DEC(tLeft);
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		for (iIndex = 0; iIndex < iPieces; iIndex++) {
			/* Descriptor: text offset at +2, property modifier at +6 */
			ulTextOffset = ulGetLong(
				iOff + (iPieces + 1) * 4 + iIndex * 8 + 2,
				aucBuffer);
			usPropMod = usGetWord(
				iOff + (iPieces + 1) * 4 + iIndex * 8 + 6,
				aucBuffer);
			/* The length is the difference of two successive offsets */
			ulTotLength = ulGetLong(iOff + (iIndex + 1) * 4,
						aucBuffer) -
					ulGetLong(iOff + iIndex * 4,
						aucBuffer);
			NO_DBG_HEX_C(usPropMod != 0, usPropMod);
			if (!bAddTextBlocks(ulTextOffset, ulTotLength,
					bUsesUnicode, usPropMod,
					ulStartBlock,
					aulBBD, tBBDLen)) {
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
		}
		break;
	}
	aucBuffer = xfree(aucBuffer);
	return TRUE;
} /* end of bGet6DocumentText */

/*
 * bGet8DocumentText - make a list of the text blocks of Word 8/97 files
 *
 * Returns TRUE when successful, FALSE if not
 */
BOOL
bGet8DocumentText(FILE *pFile, const pps_info_type *pPPS,
	const ULONG *aulBBD, size_t tBBDLen,
	const ULONG *aulSBD, size_t tSBDLen,
	const UCHAR *aucHeader)
{
	const ULONG	*aulBlockDepot;
	UCHAR	*aucBuffer;
	ULONG	ulTextOffset, ulBeginTextInfo;
	ULONG	ulTotLength, ulLen;
	long	lIndex, lPieces, lOff;
	size_t	tTextInfoLen, tBlockDepotLen, tBlockSize;
	int	iType, iLen;
	BOOL	bUsesUnicode;
	USHORT	usPropMod;

	DBG_MSG("bGet8DocumentText");

	fail(pFile == NULL || pPPS == NULL);
	fail(aulBBD == NULL || aulSBD == NULL);
	fail(aucHeader == NULL);

  	ulBeginTextInfo = ulGetLong(0x1a2, aucHeader);	/* fcClx */
	DBG_HEX(ulBeginTextInfo);
	tTextInfoLen = (size_t)ulGetLong(0x1a6, aucHeader);	/* lcbClx */
	DBG_DEC(tTextInfoLen);

	DBG_DEC(pPPS->tTable.ulSB);
	DBG_HEX(pPPS->tTable.ulSize);
	if (pPPS->tTable.ulSize == 0) {
		return FALSE;
	}

	if (pPPS->tTable.ulSize < MIN_SIZE_FOR_BBD_USE) {
	  	/* Use the Small Block Depot */
		aulBlockDepot = aulSBD;
		tBlockDepotLen = tSBDLen;
		tBlockSize = SMALL_BLOCK_SIZE;
	} else {
	  	/* Use the Big Block Depot */
		aulBlockDepot = aulBBD;
		tBlockDepotLen = tBBDLen;
		tBlockSize = BIG_BLOCK_SIZE;
	}
	aucBuffer = xmalloc(tTextInfoLen);
	if (!bReadBuffer(pFile, pPPS->tTable.ulSB,
			aulBlockDepot, tBlockDepotLen, tBlockSize,
			aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
		aucBuffer = xfree(aucBuffer);
		return FALSE;
	}
	NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);

	lOff = 0;
	while (lOff < (long)tTextInfoLen) {
		iType = (int)ucGetByte(lOff, aucBuffer);
		lOff++;
		if (iType == 0) {
			DBG_FIXME();
			lOff++;
			continue;
		}
		if (iType == 1) {
			iLen = (int)usGetWord(lOff, aucBuffer);
			vAdd2PropModList(aucBuffer + lOff);
			lOff += (long)iLen + 2;
			continue;
		}
		if (iType != 2) {
			werr(0, "Unknown type of 'fastsaved' format");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		/* Type 2 */
		ulLen = ulGetLong(lOff, aucBuffer);
		if (ulLen < 4) {
			DBG_DEC(ulLen);
			return FALSE;
		}
		lOff += 4;
		lPieces = (long)((ulLen - 4) / 12);
		DBG_DEC(lPieces);
		for (lIndex = 0; lIndex < lPieces; lIndex++) {
			ulTextOffset = ulGetLong(
				lOff + (lPieces + 1) * 4 + lIndex * 8 + 2,
				aucBuffer);
			usPropMod = usGetWord(
				lOff + (lPieces + 1) * 4 + lIndex * 8 + 6,
				aucBuffer);
			ulTotLength = ulGetLong(lOff + (lIndex + 1) * 4,
						aucBuffer) -
					ulGetLong(lOff + lIndex * 4,
						aucBuffer);
			if ((ulTextOffset & BIT(30)) == 0) {
				bUsesUnicode = TRUE;
			} else {
				bUsesUnicode = FALSE;
				ulTextOffset &= ~BIT(30);
				ulTextOffset /= 2;
			}
			NO_DBG_HEX_C(usPropMod != 0, usPropMod);
			if (!bAddTextBlocks(ulTextOffset, ulTotLength,
					bUsesUnicode, usPropMod,
					pPPS->tWordDocument.ulSB,
					aulBBD, tBBDLen)) {
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
		}
		break;
	}
	aucBuffer = xfree(aucBuffer);
	return TRUE;
} /* end of bGet8DocumentText */