/* ****************************************************************************
 *
 *	File: ParseOps.java
 *
 * ****************************************************************************
 *
 *	ADOBE CONFIDENTIAL
 *	___________________
 *
 *	Copyright 2005 Adobe Systems Incorporated
 *	All Rights Reserved.
 *
 *	NOTICE: All information contained herein is, and remains the property of
 *	Adobe Systems Incorporated and its suppliers, if any. The intellectual
 *	and technical concepts contained herein are proprietary to Adobe Systems
 *	Incorporated and its suppliers and may be covered by U.S. and Foreign
 *	Patents, patents in process, and are protected by trade secret or
 *	copyright law. Dissemination of this information or reproduction of this
 *	material is strictly forbidden unless prior written permission is obtained
 *	from Adobe Systems Incorporated.
 *
 * ***************************************************************************/
package com.adobe.internal.pdftoolkit.core.util;

import java.io.IOException;

import com.adobe.internal.io.stream.InputByteStream;
import com.adobe.internal.pdftoolkit.core.exceptions.PDFParseException;
import com.adobe.internal.pdftoolkit.core.types.ASName;
import com.adobe.internal.pdftoolkit.core.types.ASNumber;
import com.adobe.internal.pdftoolkit.core.types.ASString;

/**
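 * Static utility methods for reading and skipping low-level PDF tokens
 * (hexadecimal strings, literal strings, names, numbers and whitespace)
 * from an {@link InputByteStream}.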
 */
public final class ParseOps {

	/** Size, in bytes, of the per-thread scratch buffer handed out by {@link #tlBuffer}. */
	private static final int DEFAULT_CHUNK_SIZE = 512;

	/**
	 * Per-thread scratch buffer of {@link #DEFAULT_CHUNK_SIZE} bytes.
	 * Each thread lazily gets its own array, so no synchronization is needed
	 * when creating or using it.
	 */
	private static final ThreadLocal<byte[]> tlBuffer = new ThreadLocal<byte[]>() {
		@Override
		protected byte[] initialValue() {
			return new byte[DEFAULT_CHUNK_SIZE];
		}
	};

	/** Not instantiable: this class only hosts static helpers. */
	private ParseOps() {}
	
	/**
	 * Parses a hexadecimal character string into a byte array. A pair of
	 * hex characters forms the value stored in a single byte of the array.
	 * Hexadecimal strings are defined in section 3.2.3 of the PDF Reference
	 * Manual version 1.4.
	 * <p>
	 * The first byte read is treated as the first byte of the string body, so
	 * the stream is presumably positioned just past the opening '&lt;'
	 * (assumption; confirm against callers). The body is scanned up to the
	 * closing '&gt;' to validate it and count whitespace, then decoded by
	 * {@link #hexToByteArray(InputByteStream, int, int, int)}.
	 *
	 * @param buf           Buffer to parse
	 *
	 * @return Data corresponding to the hex string
	 * @throws IOException           If reading from the stream fails.
	 * @throws PDFParseException     If a byte that is neither a hex digit nor
	 *                               whitespace is found, or if the stream ends
	 *                               before the closing '&gt;'.
	 */
	public static byte[] readHex(InputByteStream buf)
		throws IOException, PDFParseException
	{
		int next = buf.read();
		// Position of the byte that was just read, i.e. the start of the body.
		int begin = (int)buf.getPosition() - 1;
		int whiteSpaceCount = 0;

		// Find the extent of the hex string
		while (next != '>') {
			// Detect end of stream explicitly instead of classifying (byte)-1.
			if (next == InputByteStream.EOF) {
				throw new PDFParseException("EOF occurred while reading a hex string.");
			}
			byte b = (byte)next;
			if (!ByteOps.isHexDigit(b)) {
				if (!ByteOps.isWhitespace(b)) {
					throw new PDFParseException("Expected Hex Digit" + Long.toString(buf.getPosition() - 1));
				}
				whiteSpaceCount++;
			}
			next = buf.read();
		}
		// Position of the closing '>'.
		int end = (int)buf.getPosition() - 1;

		// Parse the string.
		return hexToByteArray(buf, begin, end, whiteSpaceCount);
	}
	
	/**
	 * Skips a hexadecimal character string without decoding it. Hexadecimal
	 * strings are defined in section 3.2.3 of the PDF Reference Manual
	 * version 1.4.
	 * <p>
	 * Bytes are consumed up to and including the closing '&gt;'. Every byte
	 * before it must be a hex digit or whitespace.
	 *
	 * @param buf           Buffer to parse
	 *
	 * @throws IOException           If reading from the stream fails.
	 * @throws PDFParseException     If a byte that is neither a hex digit nor
	 *                               whitespace is found, or if the stream ends
	 *                               before the closing '&gt;'.
	 */
	public static void skipHex(InputByteStream buf)
		throws IOException, PDFParseException
	{
		// Find the extent of the hex string
		for (int next = buf.read(); next != '>'; next = buf.read()) {
			// Detect end of stream explicitly instead of classifying (byte)-1.
			if (next == InputByteStream.EOF) {
				throw new PDFParseException("EOF occurred while skipping a hex string.");
			}
			byte b = (byte)next;
			if (!ByteOps.isHexDigit(b) && !ByteOps.isWhitespace(b)) {
				throw new PDFParseException("Expected Hex Digit" + Long.toString(buf.getPosition() - 1));
			}
		}
	}

	/**
	 * Skips over a literal (parenthesised) string in the stream.
	 * <p>
	 * The nesting depth starts at 1, so the opening '(' is presumably already
	 * consumed by the caller (assumption; confirm against callers). Unescaped
	 * parentheses adjust the depth; a backslash consumes the byte after it,
	 * and up to two further octal digits when that byte is an octal digit.
	 * Skipping stops once the depth reaches zero or the stream reports EOF,
	 * in which case the method returns without signalling an error.
	 *
	 * @param inBuf stream to advance
	 * @throws IOException if reading from the stream fails
	 */
	public static void skipLiteral(InputByteStream inBuf)
		throws IOException
	{
		int depth = 1;
		do {
			byte ch = (byte)inBuf.read();
			if (ch == '\\') {
				// The escaped byte is consumed here so it can never change the depth.
				byte escaped = (byte)inBuf.read();
				if (escaped >= '0' && escaped <= '7') {
					byte second = (byte)inBuf.read();
					if (second >= '0' && second <= '7') {
						byte third = (byte)inBuf.read();
						if (third >= '0' && third <= '7') {
							// Read one byte too many so the single unget below
							// is correct on every path.
							// FIXME_IO check for EOF
							inBuf.read();
						}
					}
					// Every octal path has read exactly one byte past the escape.
					inBuf.unget();
				}
			} else if (ch == '(') {
				depth++;
			} else if (ch == ')') {
				depth--;
			}
		} while (depth != 0 && !inBuf.eof());
	}
	
	/**
	 * Reads a literal (parenthesised) string and returns its decoded bytes.
	 * <p>
	 * The nesting depth starts at 1, so the opening '(' is presumably already
	 * consumed by the caller (assumption; confirm against callers). Balanced
	 * unescaped parentheses are kept in the result; the final closing ')' is
	 * consumed but not returned. Backslash escapes are decoded: one to three
	 * octal digits, {@code \n \r \t \b \f}, and a backslash followed by an
	 * end-of-line marker (LF, CR or CR LF), which is dropped entirely. For
	 * any other escaped byte the backslash is discarded and the byte is kept.
	 *
	 * @param inBuf stream to read from
	 * @return the decoded string bytes, without the closing ')'
	 * @throws IOException if reading from the stream fails
	 * @throws PDFParseException if the stream ends before the string is closed
	 */
	public static byte[] readLiteral(InputByteStream inBuf)
		throws IOException, PDFParseException
	{
		byte[] collected = new byte[16];
		int count = 0;
		int depth = 1;
		while (depth != 0) {
			int raw = inBuf.read();
			if (raw == -1) {
				throw new PDFParseException("EOF occured while reading a literal from content stream.");
			}
			byte value = (byte)raw;
			if (value == '\\') {
				value = (byte)inBuf.read();

				// A REVERSE SOLIDUS followed by an end-of-line marker is a line
				// continuation: neither byte becomes part of the string.
				if (value == '\n') {
					continue;
				}
				if (value == '\r') {
					// CR LF counts as a single end-of-line marker.
					if ((byte)inBuf.read() != '\n') {
						inBuf.unget();
					}
					continue;
				}

				if (value >= '0' && value <= '7') {
					// One to three octal digits; overflow is truncated by the byte cast.
					int octal = value - '0';
					byte second = (byte)inBuf.read();
					if (second >= '0' && second <= '7') {
						octal = octal * 8 + (second - '0');
						byte third = (byte)inBuf.read();
						if (third >= '0' && third <= '7') {
							octal = octal * 8 + (third - '0');
							// Read one byte too many so the single unget below
							// is correct on every path.
							inBuf.read();
						}
					}
					inBuf.unget();
					value = (byte)octal;
				} else {
					switch (value) {
						case 'n': value = '\n'; break;
						case 'r': value = '\r'; break;
						case 't': value = '\t'; break;
						case 'b': value = '\b'; break;
						case 'f': value = '\f'; break;
						default:
							// '(' , ')' , '\\' and unknown escapes stand for themselves.
							break;
					}
				}
			} else if (value == '(') {
				depth++;
			} else if (value == ')') {
				depth--;
			}

			if (count == collected.length) {
				byte[] larger = new byte[collected.length * 2];
				System.arraycopy(collected, 0, larger, 0, collected.length);
				collected = larger;
			}
			collected[count++] = value;
		}

		// The terminating ')' was stored as the last byte; leave it out.
		byte[] literal = new byte[count - 1];
		System.arraycopy(collected, 0, literal, 0, literal.length);
		return literal;
	}

	/**
	 * Reads one byte from the PDF byte stream and converts it to the value
	 * of the hexadecimal digit it represents, using {@code ByteOps.getHex}.
	 *
	 * @param buf
	 *            PDF data buffer to parse
	 *
	 * @return The value of a single hexadecimal digit.
	 * @throws IOException
	 *            If reading from the stream fails.
	 * @throws PDFParseException
	 *            If the stream is already at its end.
	 */
	static byte readHexDigit(InputByteStream buf)
		throws IOException, PDFParseException
	{
		int next = buf.read();
		// Report end of stream explicitly rather than converting (byte)-1.
		if (next == InputByteStream.EOF) {
			throw new PDFParseException("EOF occurred while reading a hex digit.");
		}
		return ByteOps.getHex((byte) next);
	}

	/**
	 * Parses an atom from the PDF byte stream. An atom can be thought of as a
	 * COS name as defined in section 3.2.4 of the PDF Reference Manual version
	 * 1.4.
	 * <p>
	 * Regular bytes are collected until a non-regular byte (whitespace or
	 * delimiter) or the end of the stream is reached. A '#' is replaced by
	 * the byte formed from the two hex digits that follow it. The terminating
	 * non-regular byte, if one was read, is pushed back onto the stream.
	 * Names longer than the per-thread scratch buffer are read in full using
	 * a private, enlarged copy of that buffer.
	 *
	 * @param buf
	 *            PDF data buffer to parse
	 *
	 * @return ASName parsed from the buffer (empty if the stream is already
	 *         at its end or starts with a non-regular byte)
	 * @throws IOException
	 *            If reading from the stream fails.
	 * @throws PDFParseException
	 *            If a '#' escape is not followed by two hex digits.
	 */
	public static ASName readName(InputByteStream buf)
		throws IOException, PDFParseException
	{
		byte[] tmp = tlBuffer.get();
		int end = 0;
		// True only when the loop stopped on a byte that is not part of the name.
		boolean terminatorRead = false;

		int cur = buf.read();
		while (cur != InputByteStream.EOF) {
			if (!ByteOps.isRegular((byte) cur)) {
				terminatorRead = true;
				break;
			}

			// Handle escaped names.
			if (cur == '#')
				cur = readHexDigit(buf) * 16 + readHexDigit(buf);

			// Grow a private copy rather than truncating an over-long name.
			if (end == tmp.length) {
				byte[] grown = new byte[tmp.length * 2];
				System.arraycopy(tmp, 0, grown, 0, end);
				tmp = grown;
			}

			// Build up the name.
			tmp[end++] = (byte) cur;
			if (buf.eof())
				break;
			cur = buf.read();
		}

		// Back up to keep the subsequent whitespace or delimiter character.
		// This is decided by how the loop ended, not by the last stored byte:
		// a decoded escape such as #20 is part of the name even though it is
		// not a regular byte.
		if (terminatorRead)
			buf.unget();

		// Create the atom if it does not already exist.
		return ASName.getName(tmp, 0, end);
	}
		
	/**
	 * Skips an atom in the PDF byte stream. An atom can be thought of as a
	 * COS name as defined in section 3.2.4 of the PDF Reference Manual version
	 * 1.4.
	 * <p>
	 * Regular bytes are consumed until a non-regular byte or the end of the
	 * stream is reached. After a '#', two more bytes are consumed without
	 * being validated. A terminating non-regular byte is pushed back.
	 *
	 * @param buf
	 *            PDF data buffer to parse
	 * @throws IOException
	 *            If reading from the stream fails.
	 * @throws PDFParseException
	 *            Declared for symmetry with readName; not thrown by this code.
	 */
	public static void skipName(InputByteStream buf)
		throws IOException, PDFParseException
	{
		boolean stoppedOnTerminator;
		int ch = buf.read();
		for (;;) {
			if (!ByteOps.isRegular((byte) ch)) {
				stoppedOnTerminator = true;
				break;
			}
			if (ch == '#') {
				// Skip the two bytes of the escape.
				buf.read();
				buf.read();
			}
			if (buf.eof()) {
				// The last byte examined was regular: the name ran to end of stream.
				stoppedOnTerminator = false;
				break;
			}
			ch = buf.read();
		}

		// Keep the whitespace or delimiter that ended the name for the caller.
		if (stoppedOnTerminator) {
			buf.unget();
		}
	}

	/**
	 * Parses a numerical value from the PDF byte stream. A numeric object is
	 * defined in section 3.2.2 of the PDF Reference Manual version 1.4.
	 * <p>
	 * Accepted shape: an optional sign, digits, and an optional '.' followed
	 * by digits. The collected bytes are wrapped in an {@link ASString} and
	 * handed to {@link ASNumber}. A terminating non-regular byte (whitespace
	 * or delimiter) is pushed back onto the stream.
	 * <p>
	 * Malformed numbers of the form {@code xxx.-yyy} are tolerated: everything
	 * from the '-' up to the next whitespace is discarded, so {@code 9.0-5.0}
	 * yields {@code 9.0}. Any other regular, non-digit byte directly after the
	 * number is an error.
	 *
	 * @param first
	 *            Starting byte to parse (already read from {@code buf})
	 * @param buf
	 *            Byte stream to parse
	 *
	 * @return Parsed number.
	 * @throws IOException
	 *            If reading from the stream fails.
	 * @throws PDFParseException
	 *            If the number is followed by an unexpected regular byte.
	 */
	public static ASNumber readNumber(byte first, InputByteStream buf)
		throws PDFParseException, IOException
	{
		boolean decimalPointEncountered = false;
		// True when digits were followed by a '.' that was the last byte of
		// the stream, e.g. "5." at end of stream.
		boolean endedOnDecimalPoint = false;
		boolean digitSeen = false;
		int end = 0;
		byte[] tmp = new byte[32];

		// FIXME_IO - check for EOF!
		byte b = first;
		if ((b == '-') || (b == '+')) {
			tmp[end++] = b;
			b = (byte) buf.read();
		}
		while (ByteOps.isDigit(b)) {
			digitSeen = true;
			tmp = ensureNumberCapacity(tmp, end);
			tmp[end++] = b;
			if (buf.eof())
				break;
			b = (byte) buf.read();
		}
		if (b == '.') {
			decimalPointEncountered = true;
			tmp = ensureNumberCapacity(tmp, end);
			tmp[end++] = b;
			if (buf.eof()) {
				// Nothing follows the '.'; b still holds '.', which must not
				// be mistaken for an unexpected trailing byte below.
				endedOnDecimalPoint = digitSeen;
			} else {
				// FIXME_IO - check for EOF!
				b = (byte) buf.read();
				while (ByteOps.isDigit(b)) {
					tmp = ensureNumberCapacity(tmp, end);
					tmp[end++] = b;
					if (buf.eof())
						break;
					b = (byte) buf.read();
				}
			}
		}

		// While tokenizing PDF files from PDF Oracle, few PDFs have been encountered which have
		// incorrect numbers eg: 123.4.4.4, 123.-1 etc etc.
		// Most frequent among these patterns is of form xxx.-yyy (eg : 123.-123)
		// Acrobat simply ignores digits after decimal place in these cases.

		// We expect whitespace or a delimiter at the end of the number.
		// If we still see a digit, then it means we are at the end of stream.
		if (!endedOnDecimalPoint && ByteOps.isRegular(b) && !ByteOps.isDigit(b)) {
			// Only the pattern "<digits>.-" is tolerated; anything else is an error.
			if (!(decimalPointEncountered && b == '-')) {
				throw new PDFParseException("Expected a number at position - " + Long.toString(buf.getPosition() - 1));
			}
			// Ignore the non-whitespace characters after "-", e.g. for
			// 9.0-5.0 the 5.0 is dropped and the result is 9.0.
			while (!ByteOps.isWhitespace(b) && !buf.eof()) {
				b = (byte) buf.read();
			}
		}

		// Wrap exactly the collected bytes in a string value.
		byte[] strBuf = new byte[end];
		System.arraycopy(tmp, 0, strBuf, 0, end);
		ASNumber number = new ASNumber(new ASString(strBuf));

		// Back up to keep the subsequent whitespace or delimiter character.
		// If the last byte is regular, it means we've hit the end of stream.
		if (!ByteOps.isRegular(b))
			buf.unget();
		return number;
	}

	/**
	 * Returns a buffer with room for at least one more byte after
	 * {@code used} bytes: the same array if it has space, otherwise a copy of
	 * twice the size.
	 */
	private static byte[] ensureNumberCapacity(byte[] data, int used)
	{
		if (used < data.length) {
			return data;
		}
		byte[] grown = new byte[data.length * 2];
		System.arraycopy(data, 0, grown, 0, used);
		return grown;
	}
	
	/**
	 * Skips over a numerical value in the PDF byte stream. A numeric object is
	 * defined in section 3.2.2 of the PDF Reference Manual version 1.4.
	 * <p>
	 * Consumes an optional sign, digits, and an optional '.' followed by
	 * digits. No validation is performed. A terminating non-regular byte is
	 * pushed back onto the stream.
	 *
	 * @param first
	 *            Starting byte to parse (already read from {@code buf})
	 * @param buf
	 *            Byte stream to parse
	 * @throws IOException
	 *            If reading from the stream fails.
	 * @throws PDFParseException
	 *            Declared for symmetry with readNumber; not thrown by this code.
	 */
	public static void skipNumber(byte first, InputByteStream buf)
		throws PDFParseException, IOException
	{
		byte current = first;

		// Optional sign.
		if (current == '+' || current == '-') {
			current = (byte) buf.read();
		}

		// Integer part.
		while (ByteOps.isDigit(current) && !buf.eof()) {
			current = (byte) buf.read();
		}

		// Fractional part, only if something follows the decimal point.
		if (current == '.' && !buf.eof()) {
			do {
				current = (byte) buf.read();
			} while (ByteOps.isDigit(current) && !buf.eof());
		}

		// Keep the whitespace or delimiter that ended the number for the caller.
		// A regular last byte means the number ran to the end of the stream.
		if (!ByteOps.isRegular(current)) {
			buf.unget();
		}
	}

	/**
	 * Skips over whitespace in the input stream. Whitespace is defined in
	 * section 3.1.1 of the PDF Reference Manual version 1.4. For the purposes
	 * of this method, comments ('%' up to a CR or LF) are also considered
	 * whitespace.
	 *
	 * @param buf
	 *            Buffer to parse
	 *
	 * @return The first non-whitespace byte encountered. If the stream is
	 *         exhausted first, the value returned is a space or the last
	 *         whitespace byte read.
	 * @throws IOException
	 *            If reading from the stream fails.
	 */
	public static byte skipWhitespace(InputByteStream buf)
		throws IOException
	{
		byte current = ' ';
		for (;;) {
			if (!ByteOps.isWhitespace(current) || buf.eof()) {
				return current;
			}
			current = (byte) buf.read();
			if (current != (byte) '%') {
				continue;
			}
			// Inside a comment: consume up to the end of the line or stream.
			while (current != (byte) '\r' && current != (byte) '\n' && !buf.eof()) {
				current = (byte) buf.read();
			}
			// A comment that reaches the end of the stream is reported as whitespace.
			if (buf.eof()) {
				current = ' ';
			}
		}
	}

	/**
	 * Performs the actual parsing of a string of hexadecimal characters.
	 * Each character pair is converted into a single byte value and placed
	 * in the returned byte array. Bytes that are not hex digits are skipped.
	 * If the number of hex digits is odd, the missing final digit is taken
	 * to be 0.
	 * <p>
	 * The stream is repositioned to {@code begin}, read up to {@code end},
	 * and finally advanced by one more byte (the caller in this file passes
	 * the position of the closing '&gt;' as {@code end}).
	 *
	 * @param buf		Buffer to parse
	 * @param begin	Beginning position of the string in the buffer
	 * @param end		Ending position (exclusive) of the string in the buffer
	 * @param whiteSpaceCount Number of white space bytes between begin and end
	 *
	 * @return Data corresponding to the hex string.
	 * @throws IOException           If reading or seeking fails.
	 * @throws PDFParseException     Declared by the helpers used here.
	 */
	static byte[] hexToByteArray(InputByteStream buf, int begin, int end, int whiteSpaceCount)
		throws IOException, PDFParseException
	{
		// Two hex digits per output byte, rounding up for an odd digit count.
		final byte[] decoded = new byte[(end - begin - whiteSpaceCount + 1) / 2];
		int count = 0;
		int consumed = begin;

		buf.seek(begin);
		while (consumed < end) {
			// Locate the high nibble, skipping anything that is not a hex digit.
			// FIXME_IO check for EOF
			byte high;
			do {
				high = (byte)buf.read();
				consumed++;
			} while (!ByteOps.isHexDigit(high) && (consumed < end));
			if (!ByteOps.isHexDigit(high)) {
				continue;
			}

			int value = ByteOps.getHex(high) * 16;

			// Locate the low nibble, if any bytes remain; otherwise it stays 0.
			if (consumed < end) {
				byte low;
				do {
					low = (byte)buf.read();
					consumed++;
				} while (!ByteOps.isHexDigit(low) && (consumed < end));
				if (ByteOps.isHexDigit(low)) {
					value += ByteOps.getHex(low);
				}
			}
			decoded[count++] = (byte)value;
		}

		// Step over the byte at 'end'.
		buf.seek(buf.getPosition() + 1); // end+1 ???

		if (count == decoded.length) {
			return decoded;
		}
		byte[] trimmed = new byte[count];
		System.arraycopy(decoded, 0, trimmed, 0, count);
		return trimmed;
	}
	
	/**
	 * Skips over a string of hexadecimal characters without producing any
	 * output. The stream is repositioned to {@code begin}, read up to
	 * {@code end} in the same high-nibble/low-nibble pattern used when
	 * decoding, and finally advanced by one more byte.
	 *
	 * @param buf		Buffer to parse
	 * @param begin	Beginning position of the string in the buffer
	 * @param end		Ending position (exclusive) of the string in the buffer
	 * @param whiteSpaceCount Number of white space bytes in buf (not used here)
	 *
	 * @throws IOException           If reading or seeking fails.
	 * @throws PDFParseException     Declared for symmetry with hexToByteArray.
	 */
	static void skipHexToByteArray(InputByteStream buf, int begin, int end, int whiteSpaceCount)
		throws IOException, PDFParseException
	{
		int consumed = begin;
		buf.seek(begin);
		while (consumed < end) {
			// High nibble: skip anything that is not a hex digit.
			byte high;
			do {
				high = (byte)buf.read();
				consumed++;
			} while (!ByteOps.isHexDigit(high) && (consumed < end));

			if (!ByteOps.isHexDigit(high) || consumed >= end) {
				continue;
			}

			// Low nibble, consumed the same way.
			byte low;
			do {
				low = (byte)buf.read();
				consumed++;
			} while (!ByteOps.isHexDigit(low) && (consumed < end));
		}
		// Step over the byte at 'end'.
		buf.seek(buf.getPosition() + 1); // end+1 ???
	}

	/**
	 * Scans forward from the current position for the keyword "endobj" and
	 * returns the stream position of its first byte. If the end of the stream
	 * is reached before "endobj" is found, -1 is returned. On success the
	 * stream is left positioned just past the keyword.
	 *
	 * @param mBuf stream to scan
	 * @return position of the 'e' of "endobj", or -1 if it was not found
	 * @throws IOException if reading from the stream fails
	 */
	public static long getEndObjPos(InputByteStream mBuf) throws IOException
	{
		// Bytes that must follow an 'e' to complete the keyword.
		final String tail = "ndobj";

		int current = mBuf.read();
		while (current != InputByteStream.EOF) {
			if (current != 'e') {
				current = mBuf.read();
				continue;
			}

			int matched = 0;
			while (matched < tail.length()) {
				current = mBuf.read();
				if (current != tail.charAt(matched)) {
					// Keep the mismatching byte: it is re-examined by the
					// outer loop, since it may itself be an 'e' or EOF.
					break;
				}
				matched++;
			}
			if (matched == tail.length()) {
				// Six bytes ("endobj") were consumed since the keyword began.
				return mBuf.getPosition() - 6;
			}
		}
		return -1;
	}
}
