/*  AO3Scraper 2.0: Scrapes AO3 to extract work statistics
    Copyright (C) 2013  John Elliott  <jce@seasip.demon.co.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package info.seasip.ao3scraper;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import javax.swing.JTable;
import javax.swing.SwingUtilities;
import javax.swing.table.DefaultTableModel;

/** This class handles the business of downloading and parsing HTML from AO3.
 * It runs as a separate thread so the UI remains responsive (though as the 
 * program is currently constituted, there is nothing the UI can really do
 * while the download is in progress).
 *
 * @author John Elliott  jce@seasip.demon.co.uk
 */
public class Downloader extends Thread {

    /* Event types -- used by DownloaderEventHandler */
    static final int DONE = 1;
    static final int FAILED = 2;
    static final int PROGRESS = 3;

    // Some snippets of AO3 HTML.
    private static final String HEADER_MODULE = "<div class=\"header module\">";
    private static final String DATETIME      = "<p class=\"datetime\">";
    private static final String WORK_LINK     = "<a href=\"/works/";
    private static final String WORKS_BY      = " Works by";

    // Charset used to decode downloaded pages, instead of whatever the
    // platform default happens to be.
    // NOTE(review): assumes AO3 serves its pages as UTF-8 -- confirm.
    private static final String PAGE_CHARSET = "UTF-8";

    // Number of works listed on one page of results.
    private static final int WORKS_PER_PAGE = 20;

    // Milliseconds in a day, for converting a time difference to an age.
    private static final long MILLIS_PER_DAY = 24L * 60L * 60L * 1000L;

    // AO3 username
    private final String user;

    // Listeners to our events
    private final ArrayList<DownloaderEventListener> listeners;

    // Number of works (giving number of pages to parse)
    private int works;

    // List of parsed works.
    ArrayList<Work> workList = new ArrayList<Work>();

    // Set to true to interrupt a download. Volatile because it is written
    // by the thread calling onStop() and read by the download thread.
    private volatile boolean stopped = false;

    /** Constructor for the downloader.
     *
     * @param u AO3 username.
     */
    public Downloader(String u) {
        user = u;
        listeners = new ArrayList<DownloaderEventListener>();
    }

    /** Get the current username.
     *
     * @return The username with which this Downloader was constructed.
     */
    public String getUsername()
    {
        return user;
    }


    /** Add a listener which will be called when this downloader succeeds,
     * fails or reports an operation in progress.
     *
     * @param listener The listener that will receive events.
     */
    public void addDownloadListener(DownloaderEventListener listener) {
        listeners.add(listener);
    }


    /** Distribute a 'Done' event to all listeners.
     *
     * The event message reports how many works were parsed. Each listener
     * is notified through SwingUtilities.invokeLater.
     */
    private void done() {
        int size = workList.size();
        String message = (size == 1) ? "1 work found."
                                     : Integer.toString(size) + " works found.";

        for (int n = 0; n < listeners.size(); n++) {
            Runnable e = new DownloaderEventHandler(DONE,
                    this, listeners.get(n), message);
            SwingUtilities.invokeLater(e);
        }
    }


    /** Distribute a 'Failed' event to all listeners.
     *
     * @param err Description of the failure, passed to each listener.
     */
    private void failed(String err) {
        for (int n = 0; n < listeners.size(); n++) {
            Runnable e = new DownloaderEventHandler(FAILED,
                    this, listeners.get(n), err);
            SwingUtilities.invokeLater(e);
        }
    }


    /** Distribute an 'Operation in progress' event to all listeners.
     *
     * @param s   Description of the current operation.
     * @param tot Total amount of work to do.
     * @param cur Amount of work done so far.
     */
    private void progress(String s, int tot, int cur) {
        for (int n = 0; n < listeners.size(); n++) {
            Runnable e = new DownloaderEventHandler(PROGRESS,
                    this, listeners.get(n), s, tot, cur);
            SwingUtilities.invokeLater(e);
        }
    }


    /** Read a page into a string.
     *
     * Lines are concatenated without their line terminators, so the
     * result is one long line.
     *
     * @param rdr A BufferedReader (assumed to be reading an HTTP connection).
     * @return The page read, as a String.
     * @throws IOException if reading failed for any reason.
     */
    private String readPage(BufferedReader rdr) throws IOException {
        StringBuilder b = new StringBuilder();
        String line;

        while ( (line = rdr.readLine()) != null) {
            b.append(line);
        }
        return b.toString();
    }


    /** Download the page at the given address and return it as a string.
     *
     * The connection is always closed, even if reading fails part way.
     *
     * @param address The URL to download.
     * @return The page contents, as returned by readPage().
     * @throws IOException if the URL is malformed or the download fails.
     */
    private String fetchPage(String address) throws IOException {
        InputStream raw = new URL(address).openStream();
        try {
            return readPage(new BufferedReader(
                    new InputStreamReader(raw, PAGE_CHARSET)));
        }
        finally {
            raw.close();
        }
    }


    /** Count the number of works AO3 lists for an author.
     *
     * On success the count is stored in the 'works' field.
     *
     * @param page An HTML page listing works.
     * @return true if the number of works could be parsed, false otherwise.
     */
    private boolean countWorks(String page) {
        // Find the string "nnn Works by author".
        int idx = page.indexOf(WORKS_BY);
        if (idx <= 0) {
            return false;
        }
        // Then backtrack to the beginning of the number that should
        // precede "Works by". AO3 formats numbers with commas, so a
        // count such as "1,234" must be taken as a whole.
        int n0 = idx;
        while (n0 > 0) {
            char ch = page.charAt(n0 - 1);
            if (!Character.isDigit(ch) && ch != ',') {
                break;
            }
            --n0;
        }
        // Copy out the number of works, dropping the commas.
        StringBuilder digits = new StringBuilder();
        for (int i = n0; i < idx; i++) {
            char ch = page.charAt(i);
            if (ch != ',') {
                digits.append(ch);
            }
        }
        try {
            works = Integer.parseInt(digits.toString());
        }
        catch (NumberFormatException ex) {
            // No digits before "Works by", or the number is too big.
            return false;
        }
        return true;
    }


    /** Parse a header block for a specified statistic.
     *
     * The value is the first run of text outside an HTML tag, following
     * the statistic's title, that starts with a digit. Non-digit characters
     * in it (such as commas) are skipped. Values too big for an int are
     * reported as Integer.MAX_VALUE.
     *
     * @param header The HTML header for a work.
     * @param title The title of the statistic to parse.
     * @return The value of the statistic, or 0 if the title was not matched
     *         or no number follows it.
     */
    private int parseStat(String header, String title) {
        // Statistics are all in <dt> / <dd> block pairs; so look for the <dt>.
        String match = "<dt>" + title + ":";
        int pos = header.indexOf(match);
        if (pos < 0) {
            return 0;       // Statistic not found
        }
        final int len = header.length();
        int bracketDepth = 0;

        // Start after <dt>Statistic: and look for the next digit that is
        // not inside an HTML <tag>.
        for (pos += match.length(); pos < len; pos++) {
            char ch = header.charAt(pos);

            if (ch == '<') {
                ++bracketDepth;     // Start of tag
            }
            else if (ch == '>') {
                --bracketDepth;     // End of tag
            }
            else if (bracketDepth == 0 && Character.isDigit(ch)) {
                // Accumulate digits up to the next HTML tag (which should
                // be </dd>) or the end of the header, whichever is first.
                long value = 0;
                for (int i = pos; i < len; i++) {
                    char c = header.charAt(i);
                    if (c == '<') {
                        break;
                    }
                    if (Character.isDigit(c)) {
                        value = value * 10 + Character.digit(c, 10);
                        if (value > Integer.MAX_VALUE) {
                            value = Integer.MAX_VALUE;  // Saturate
                        }
                    }
                }
                return (int) value;
            }
        }
        return 0;
    }


    /** Parse the header for a single work, and add it to the array.
     *
     * Anything that cannot be found or parsed is left at whatever value a
     * newly constructed Work gives it; the work is added to the list
     * regardless.
     *
     * @param header The HTML for one work, starting at its header module.
     */
    private void parseHeader(String header) {
        Work work = new Work();

        int stamp = header.indexOf(DATETIME);    // Is there a timestamp?
        if (stamp >= 0) {
            int start = stamp + DATETIME.length();
            while (start < header.length() && header.charAt(start) == ' ') {
                ++start;
            }
            // The month is matched as an English abbreviation whatever
            // the default locale of the JVM.
            SimpleDateFormat format =
                    new SimpleDateFormat("dd MMM yyyy", Locale.ENGLISH);
            try {
                work.date = format.parse(header.substring(start));

                // Calculate the work's age in days, as today less work date.
                long diff = new Date().getTime() - work.date.getTime();
                work.age = (int)(diff / MILLIS_PER_DAY);
            }
            catch (ParseException ignored) {
                // Ignore date parse error; the work keeps its default
                // date and age.
            }
        }
        // Now find the title, which is inside an <a> link to the work.
        int link = header.indexOf(WORK_LINK);
        if (link >= 0) {
            // Search for the end of the <a tag.
            int tagEnd = header.indexOf('>', link);
            if (tagEnd >= 0) {
                // And take characters up to the beginning of the </a> tag,
                // provided there is one.
                int close = header.indexOf('<', tagEnd);
                if (close >= 0) {
                    work.title = header.substring(tagEnd + 1, close);
                }
            }
        }
        // The other stats have a consistent form that's easy to parse.
        work.hits = parseStat(header, "Hits");
        work.bookmarks = parseStat(header, "Bookmarks");
        work.comments = parseStat(header, "Comments");
        work.kudos = parseStat(header, "Kudos");
        work.words = parseStat(header, "Words");
        workList.add(work);
    }

    /** Parse a page and extract details for each work on it (usually there are
     * 20).
     *
     * @param page The downloaded page.
     */
    private void parse(String page) {
        // The entry for each work begins with a header module, and runs
        // up to the next header module or the end of the page.
        int start = page.indexOf(HEADER_MODULE);
        while (start >= 0) {
            int next = page.indexOf(HEADER_MODULE, start + 1);
            String header = (next >= 0) ? page.substring(start, next)
                                        : page.substring(start);
            parseHeader(header);
            start = next;
        }
    }

    /** Populate an onscreen table with the results.
     *
     * The table's model is replaced by a new DefaultTableModel with one
     * row per parsed work.
     *
     * @param table The table to set up and fill.
     */
    public void fillTable(JTable table) {
        final Object[] headers = {
            "Title", "Hits", "Kudos", "Comments",
            "Bookmarks", "Date", "Age", "Words"
        };
        Object[][] rows = new Object[workList.size()][headers.length];

        for (int r = 0; r < rows.length; r++) {
            Work wk = workList.get(r);
            rows[r][0] = wk.title;
            rows[r][1] = Integer.valueOf(wk.hits);
            rows[r][2] = Integer.valueOf(wk.kudos);
            rows[r][3] = Integer.valueOf(wk.comments);
            rows[r][4] = Integer.valueOf(wk.bookmarks);
            rows[r][5] = wk.formattedDate();
            rows[r][6] = Integer.valueOf(wk.age);
            rows[r][7] = Integer.valueOf(wk.words);
        }
        table.setModel(new DefaultTableModel(rows, headers));
    }

    /** Save the parsed statistics.
     *
     * Files are saved as tab-separated text: a header line, then one line
     * per work. A work whose title was not found is written with an empty
     * title.
     *
     * @param output The file to which output should be saved.
     * @throws IOException on any error.
     */
    public void saveStats(File output) throws IOException
    {
        BufferedWriter out = new BufferedWriter (new FileWriter(output));
        try {
            out.write("Title\tHits\tKudos\tComments\tBookmarks\tDate\tAge\tWords\n");
            for (int n = 0; n < workList.size(); n++)
            {
                Work wk = workList.get(n);
                // Writer.write(String) rejects null, so substitute "".
                out.write(wk.title == null ? "" : wk.title); out.write('\t');
                out.write(Integer.toString(wk.hits));  out.write('\t');
                out.write(Integer.toString(wk.kudos));  out.write('\t');
                out.write(Integer.toString(wk.comments));  out.write('\t');
                out.write(Integer.toString(wk.bookmarks));  out.write('\t');
                out.write(wk.formattedDate());  out.write('\t');
                out.write(Integer.toString(wk.age));  out.write('\t');
                out.write(Integer.toString(wk.words));  out.write('\n');
            }
        }
        finally {
            // Close (and so flush) the file even if a write failed.
            out.close();
        }
    }

    /** Set the 'stopped' flag.
     *
     * The download loop checks the flag before each page, so a download
     * in progress ends after the page currently being handled and then
     * reports 'Done' with the works parsed so far.
     */
    public synchronized void onStop() {
        stopped = true;
    }


    /** Do the download.
     * The caller should implement DownloaderEventListener to receive
     * results.
     *
     * The first page of the user's works gives the total number of works;
     * further pages are then fetched until that many works have been
     * covered. Any exception is reported as a 'Failed' event; otherwise a
     * 'Done' event is sent at the end.
     */
    @Override
    public void run() {
        // Clear any existing list of works.
        workList.clear();
        // First page to load
        // NOTE(review): plain http. If the site redirects to https,
        // URL.openStream() may not follow the redirect -- confirm.
        String pageroot = "http://archiveofourown.org/users/" + user + "/works";

        try {
            // Download the first page.
            progress("Downloading " + pageroot, 100, 0);
            String page = fetchPage(pageroot);
            // It should list number of works.
            if (!countWorks(page)) {
                failed("Cannot find 'nnn Works by ' text in result.");
                return;
            }
            // Keep going until we have read enough pages to cover all the
            // works there should be.
            for (int n = 0; n < works; n += WORKS_PER_PAGE) {
                if (stopped) {
                    break;
                }
                if (n > 0) {
                    // For pages other than the first, add a ?page=nn parameter to the URL.
                    String nextPageName = pageroot + "?page="
                            + Integer.toString((n / WORKS_PER_PAGE) + 1);
                    progress("Downloading " + nextPageName, works, n);
                    page = fetchPage(nextPageName);
                }
                // Analyse the downloaded page.
                progress("Analysing", works, n);
                parse(page);
            }
        }
        catch (Exception ex) {
            // Any exception results in a 'failed' callback.
            failed(ex.getClass().getName() + ": " + ex.getMessage());
            return;
        }
        done();
    }

}
