/*************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * ___________________
 *
 *  Copyright 2013 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/
package com.day.cq.wcm.offline;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;

/**
 * An implementation of {@link TextDocumentImporter} for "classic" MS Word
 * documents ("doc").
 */
public class DocImporter implements TextDocumentImporter {

    private final HWPFDocument document;
    private final Range range;

    public DocImporter(InputStream stream) throws TextImportException {
        try {
            this.document = new HWPFDocument(stream);
            this.range = document.getRange();
        } catch (OfficeXmlFileException ex) {
            throw new TextImportException("this is a docx file", ex);
        } catch (IOException ex) {
            throw new TextImportException(ex.getMessage(), ex);
        }
    }

    /**
     * @return contents of first non-empty paragraph
     */
    public String getTitle() {
        for (int i = 0; i < this.range.numParagraphs(); i++) {
            org.apache.poi.hwpf.usermodel.Paragraph p = this.range.getParagraph(i);
            String t = p.text().trim();
            if (!"".equals(t)) {
                return t;
            }
        }
        return null;
    }

    public int getNumberOfParagraphs() {
        return this.range.numParagraphs();
    }

    public Paragraph getParagraph(int index) {
        return new DocParagraph(this.range.getParagraph(index));
    }

    private class DocParagraph implements Paragraph {

        private final org.apache.poi.hwpf.usermodel.Paragraph p;
        private final String text;
        private final String textHTML;
        private final List<Picture> pictures;

        public DocParagraph(org.apache.poi.hwpf.usermodel.Paragraph paragraph) {
            this.p = paragraph;

            StyleDescription sd = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
            String classname = WordStyleSupport.makeClassName(sd.getName());
            String container = "p";
            String elemname = WordStyleSupport.toHtmlElement(classname);
            if (elemname != null) {
                container = elemname;
                classname = null;
            }

            StringBuilder sbtext = new StringBuilder();
            StringBuilder sbhtml = new StringBuilder();
            List<Picture> pics = new ArrayList<Picture>();

            for (int i = 0; i < this.p.numCharacterRuns(); i++) {
                CharacterRun characters = this.p.getCharacterRun(i);

                if (document.getPicturesTable().hasPicture(characters)) {
                    pics.add(new DocPicture(document.getPicturesTable().extractPicture(characters, true)));
                } else {
                    String contents = characters.text();
                    boolean onlyWhiteSpace = contents.trim().length() == 0;

                    if (!onlyWhiteSpace) {
                        if (characters.isBold()) {
                            sbhtml.append("<b>");
                        }
                        if (characters.isItalic()) {
                            sbhtml.append("<i>");
                        }
                        if (characters.isStrikeThrough()) {
                            sbhtml.append("<del>");
                        }
                        if (characters.getUnderlineCode() != 0) {
                            sbhtml.append("<u>");
                        }
                        if (characters.getSubSuperScriptIndex() == 1) {
                            sbhtml.append("<sup>");
                        }
                        if (characters.getSubSuperScriptIndex() == 2) {
                            sbhtml.append("<sub>");
                        }
                    }

                    sbtext.append(contents);
                    sbhtml.append(HtmlUtil.escapeHtmlText(contents));

                    if (!onlyWhiteSpace) {
                        if (characters.getSubSuperScriptIndex() == 2) {
                            sbhtml.append("</sub>");
                        }
                        if (characters.getSubSuperScriptIndex() == 1) {
                            sbhtml.append("</sup>");
                        }
                        if (characters.getUnderlineCode() != 0) {
                            sbhtml.append("</u>");
                        }
                        if (characters.isStrikeThrough()) {
                            sbhtml.append("</del>");
                        }
                        if (characters.isItalic()) {
                            sbhtml.append("</i>");
                        }
                        if (characters.isBold()) {
                            sbhtml.append("</b>");
                        }
                    }
                }
            }

            String result = sbhtml.toString().trim();

            if (result.length() > 0) {
                StringBuilder tmp = new StringBuilder();
                tmp.append("<");
                tmp.append(container);
                String style = null;
                if (this.p.getJustification() == 2) {
                    style = "text-align: right;";
                } else if (this.p.getJustification() == 1) {
                    style = "text-align: center;";
                }
                if (style != null) {
                    tmp.append(" style='" + style + "'");
                }
                if (classname != null) {
                    tmp.append(" class='" + HtmlUtil.escapeHtmlAttr(classname) + "'");
                }

                tmp.append(">");

                tmp.append(result);

                tmp.append("</");
                tmp.append(container);
                tmp.append(">");

                result = tmp.toString();
            }

            this.text = sbtext.toString().trim();
            this.textHTML = result;
            this.pictures = Collections.unmodifiableList(pics);
        }

        public String getText() {
            return this.text;
        }

        public String getHTML() {
            return this.textHTML;
        }

        public List<Picture> getPictures() {
            return this.pictures;
        }
    }

    private class DocPicture implements Picture {

        private final org.apache.poi.hwpf.usermodel.Picture picture;

        public DocPicture(org.apache.poi.hwpf.usermodel.Picture picture) {
            this.picture = picture;
        }

        public String getMediaType() {
            return this.picture.getMimeType();
        }

        public byte[] getBytes() {
            return this.picture.getContent();
        }

        @Override
        public String toString() {
            return getMediaType() + " (" + getBytes().length + " bytes)";
        }
    }
}
