% (removed: GitHub web-UI scrape artifacts — navigation text and a line-number gutter that were not part of the BibTeX database)
% Each inproceedings entry is generated from https://dl.acm.org/doi/proceedings/10.5555/3023476.
@proceedings{UAI-2008,
booktitle = {Proceedings of the 24th Conference on Uncertainty in Artificial Intelligence},
name = {Uncertainty in Artificial Intelligence},
shortname = {UAI},
year = {2008},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
start = {2008-07-09},
end = {2008-07-12},
address = {The University of Helsinki City Centre Campus, Helsinki},
published = {2024-10-30},
conference_url = {https://www.auai.org/uai2008/},
conference_number = {24},
firstpublished = {2008-07-09},
}
@inproceedings{0,
abstract = {Many algorithms and applications involve repeatedly solving variations of the same inference problem; for example we may want to introduce new evidence to the model or perform updates to conditional dependencies. The goal of adaptive inference is to take advantage of what is preserved in the model and perform inference more rapidly than from scratch. In this paper, we describe techniques for adaptive inference on general graphs that support marginal computation and updates to the conditional probabilities and dependencies in logarithmic time. We give experimental results for an implementation of our algorithm, and demonstrate its potential performance benefit in the study of protein structure.},
author = {Acar, Umut A. and Ihler, Alexander T. and Mettu, Ramgopal R. and S{\"u}mer, {\"O}zg{\"u}r},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {1--8},
title = {Adaptive inference on general graphical models},
year = {2008}
}
@inproceedings{1,
abstract = {We present an algorithm that identifies the reasoning patterns of agents in a game, by iteratively examining the graph structure of its Multi-Agent Influence Diagram (MAID) representation. If the decision of an agent participates in no reasoning patterns, then we can effectively ignore that decision for the purpose of calculating a Nash equilibrium for the game. In some cases, this can lead to exponential time savings in the process of equilibrium calculation. Moreover, our algorithm can be used to enumerate the reasoning patterns in a game, which can be useful for constructing more effective computerized agents interacting with humans.},
author = {Antos, Dimitrios and Pfeffer, Avi},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {9--18},
title = {Identifying reasoning patterns in games},
year = {2008}
}
@inproceedings{10,
abstract = {Traditional multi-view learning approaches suffer in the presence of view disagreement, i.e., when samples in each view do not belong to the same class due to view corruption, occlusion or other noise processes. In this paper we present a multi-view learning approach that uses a conditional entropy criterion to detect view disagreement. Once detected, samples with view disagreement are filtered and standard multi-view learning methods can be successfully applied to the remaining samples. Experimental evaluation on synthetic and audio-visual databases demonstrates that the detection and filtering of view disagreement considerably increases the performance of traditional multi-view learning approaches.},
author = {Christoudias, C. Mario and Urtasun, Raquel and Darrell, Trevor},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {88--96},
title = {Multi-view learning in the presence of view disagreement},
year = {2008}
}
@inproceedings{11,
abstract = {We address the problem of computing approximate marginals in Gaussian probabilistic models by using mean field and fractional Bethe approximations. As an extension of Welling and Teh (2001), we define the Gaussian fractional Bethe free energy in terms of the moment parameters of the approximate marginals and derive an upper and lower bound for it. We give necessary conditions for the Gaussian fractional Bethe free energies to be bounded from below. It turns out that the bounding condition is the same as the pairwise normalizability condition derived by Malioutov et al. (2006) as a sufficient condition for the convergence of the message passing algorithm. By giving a counterexample, we disprove the conjecture in Welling and Teh (2001): even when the Bethe free energy is not bounded from below, it can possess a local minimum to which the minimization algorithms can converge.},
author = {Cseke, Botond and Heskes, Tom},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {97--104},
title = {Bounds on the {Bethe} free energy for {Gaussian} networks},
year = {2008}
}
@inproceedings{12,
abstract = {The problem of learning discrete Bayesian networks from data is encoded as a weighted MAX-SAT problem and the MaxWalkSat local search algorithm is used to address it. For each dataset, the per-variable summands of the (BDeu) marginal likelihood for different choices of parents ('family scores') are computed prior to applying MaxWalkSat. Each permissible choice of parents for each variable is encoded as a distinct propositional atom and the associated family score encoded as a 'soft' weighted single-literal clause. Two approaches to enforcing acyclicity are considered: either by encoding the ancestor relation or by attaching a total order to each graph and encoding that. The latter approach gives better results. Learning experiments have been conducted on 21 synthetic datasets sampled from 7 BNs. The largest dataset has 10,000 datapoints and 60 variables producing (for the 'ancestor' encoding) a weighted CNF input file with 19,932 atoms and 269,367 clauses. For most datasets, MaxWalkSat quickly finds BNs with higher BDeu score than the 'true' BN. The effect of adding prior information is assessed. It is further shown that Bayesian model averaging can be effected by collecting BNs generated during the search.},
author = {Cussens, James},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {105--112},
title = {{Bayesian} network learning by compiling to weighted {MAX-SAT}},
year = {2008}
}
@inproceedings{13,
abstract = {We consider conditions that allow us to find an optimal strategy for sequential decisions from a given data situation. For the case where all interventions are unconditional (atomic), identifiability has been discussed by Pearl \& Robins (1995). We argue here that an optimal strategy must be conditional, i.e. take the information available at each decision point into account. We show that the identification of an optimal sequential decision strategy is more restrictive, in the sense that conditional interventions might not always be identified when atomic interventions are. We further demonstrate that a simple graphical criterion for the identifiability of an optimal strategy can be given.},
author = {Dawid, A. Philip and Didelez, Vanessa},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {113--120},
title = {Identifying optimal sequential decisions},
year = {2008}
}
@inproceedings{14,
abstract = {This paper describes a new algorithm to solve the decision making problem in Influence Diagrams based on algorithms for credal networks. Decision nodes are associated to imprecise probability distributions and a reformulation is introduced that finds the global maximum strategy with respect to the expected utility. We work with Limited Memory Influence Diagrams, which generalize most Influence Diagram proposals and handle simultaneous decisions. Besides the global optimum method, we explore an anytime approximate solution with a guaranteed maximum error and show that imprecise probabilities are handled in a straightforward way. Complexity issues and experiments with random diagrams and an effects-based military planning problem are discussed.},
author = {de Campos, Cassio P. and Ji, Qiang},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {121--128},
title = {Strategy selection in influence diagrams using imprecise probabilities},
year = {2008}
}
@inproceedings{15,
abstract = {When the initial and transition probabilities of a finite Markov chain in discrete time are not well known, we should perform a sensitivity analysis. This is done by considering as basic uncertainty models the so-called credal sets that these probabilities are known or believed to belong to, and by allowing the probabilities to vary over such sets. This leads to the definition of an imprecise Markov chain. We show that the time evolution of such a system can be studied very efficiently using so-called lower and upper expectations. We also study how the inferred credal set about the state at time n evolves as n → ∞: under quite unrestrictive conditions, it converges to a uniquely invariant credal set, regardless of the credal set given for the initial state. This leads to a non-trivial generalisation of the classical Perron-Frobenius Theorem to imprecise Markov chains.},
author = {de Cooman, Gert and Hermans, Filip and Quaeghebeur, Erik},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {129--136},
title = {Sensitivity analysis for finite {Markov} chains in discrete time},
year = {2008}
}
@inproceedings{16,
abstract = {Graphical models trained using maximum likelihood are a common tool for probabilistic inference of marginal distributions. However, this approach suffers difficulties when either the inference process or the model is approximate. In this paper, the inference process is first defined to be the minimization of a convex function, inspired by free energy approximations. Learning is then done directly in terms of the performance of the inference process at univariate marginal prediction. The main novelty is that this is a direct minimization of empirical risk, where the risk measures the accuracy of predicted marginals.},
author = {Domke, Justin},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {137--144},
title = {Learning convex inference of marginals},
year = {2008}
}
@inproceedings{17,
abstract = {A graphical multiagent model (GMM) represents a joint distribution over the behavior of a set of agents. One source of knowledge about agents' behavior may come from game-theoretic analysis, as captured by several graphical game representations developed in recent years. GMMs generalize this approach to express arbitrary distributions, based on game descriptions or other sources of knowledge bearing on beliefs about agent behavior. To illustrate the flexibility of GMMs, we exhibit game-derived models that allow probabilistic deviation from equilibrium, as well as models based on heuristic action choice. We investigate three different methods of integrating these models into a single model representing the combined knowledge sources. To evaluate the predictive performance of the combined model, we treat as actual outcome the behavior produced by a reinforcement learning process. We find that combining the two knowledge sources, using any of the methods, provides better predictions than either source alone. Among the combination methods, mixing data outperforms the opinion pool and direct update methods investigated in this empirical trial.},
author = {Duong, Quang and Wellman, Michael P. and Singh, Satinder},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {145--152},
title = {Knowledge combination in graphical multiagent models},
year = {2008}
}
@inproceedings{18,
abstract = {Gaussian Markov random fields (GMRFs) are useful in a broad range of applications. In this paper we tackle the problem of learning a sparse GMRF in a high-dimensional space. Our approach uses the ℓ1-norm as a regularization on the inverse covariance matrix. We utilize a novel projected gradient method, which is faster than previous methods in practice and equal to the best performing of these in asymptotic complexity. We also extend the ℓ1-regularized objective to the problem of sparsifying entire blocks within the inverse covariance matrix. Our methods generalize fairly easily to this case, while other methods do not. We demonstrate that our extensions give better generalization performance on two real domains—biological network analysis and a 2D-shape modeling image task.},
author = {Duchi, John and Gould, Stephen and Koller, Daphne},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {153--160},
title = {Projected subgradient methods for learning sparse {Gaussians}},
year = {2008}
}
@inproceedings{19,
abstract = {We conjecture that the worst case number of experiments necessary and sufficient to discover a causal graph uniquely given its observational Markov equivalence class can be specified as a function of the largest clique in the Markov equivalence class. We provide an algorithm that computes intervention sets that we believe are optimal for the above task. The algorithm builds on insights gained from the worst case analysis in Eberhardt et al. (2005) for sequences of experiments when all possible directed acyclic graphs over N variables are considered. A simulation suggests that our conjecture is correct. We also show that a generalization of our conjecture to other classes of possible graph hypotheses cannot be given easily, and in what sense the algorithm is then no longer optimal.},
author = {Eberhardt, Frederick},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {161--168},
title = {Almost optimal intervention sets for causal discovery},
year = {2008}
}
@inproceedings{2,
abstract = {Chordal graphs can be used to encode dependency models that are representable by both directed acyclic and undirected graphs. This paper discusses a very simple and efficient algorithm to learn the chordal structure of a probabilistic model from data. The algorithm is a greedy hill-climbing search algorithm that uses the inclusion boundary neighborhood over chordal graphs. In the limit of a large sample size and under appropriate hypotheses on the scoring criterion, we prove that the algorithm will find a structure that is inclusion-optimal when the dependency model of the data-generating distribution can be represented exactly by an undirected graph. The algorithm is evaluated on simulated datasets.},
author = {Auvray, Vincent and Wehenkel, Louis},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {18--25},
title = {Learning inclusion-optimal chordal graphs},
year = {2008}
}
@inproceedings{20,
abstract = {A central task in many applications is reasoning about processes that change over continuous time. Continuous-Time Bayesian Networks is a general compact representation language for multi-component continuous-time processes. However, exact inference in such processes is exponential in the number of components, and thus infeasible for most models of interest. Here we develop a novel Gibbs sampling procedure for multi-component processes. This procedure iteratively samples a trajectory for one of the components given the remaining ones. We show how to perform exact sampling that adapts to the natural time scale of the sampled process. Moreover, we show that this sampling procedure naturally exploits the structure of the network to reduce the computational cost of each step. This procedure is the first that can provide asymptotically unbiased approximation in such processes.},
author = {El-Hay, Tal and Friedman, Nir and Kupferman, Raz},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {169--178},
title = {{Gibbs} sampling in factorized continuous-time {Markov} processes},
year = {2008}
}
@inproceedings{21,
abstract = {When related learning tasks are naturally arranged in a hierarchy, an appealing approach for coping with scarcity of instances is that of transfer learning using a hierarchical Bayes framework. As fully Bayesian computations can be difficult and computationally demanding, it is often desirable to use posterior point estimates that facilitate (relatively) efficient prediction. However, the hierarchical Bayes framework does not always lend itself naturally to this maximum a-posteriori goal. In this work we propose an undirected reformulation of hierarchical Bayes that relies on priors in the form of similarity measures. We introduce the notion of "degree of transfer" weights on components of these similarity measures, and show how they can be automatically learned within a joint probabilistic framework. Importantly, our reformulation results in a convex objective for many learning problems, thus facilitating optimal posterior point estimation using standard optimization techniques. In addition, we no longer require proper priors, allowing for flexible and straightforward specification of joint distributions over transfer hierarchies. We show that our framework is effective for learning models that are part of transfer hierarchies for two real-life tasks: object shape modeling using Gaussian density estimation and document classification.},
author = {Elidan, Gal and Packer, Ben and Heitz, Geremy and Koller, Daphne},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {179--187},
title = {Convex point estimation using undirected {Bayesian} transfer hierarchies},
year = {2008}
}
@inproceedings{22,
abstract = {In addressing the challenge of exponential scaling with the number of agents we adopt a cluster-based representation to approximately solve asymmetric games of very many players. A cluster groups together agents with a similar "strategic view" of the game. We learn the clustered approximation from data consisting of strategy profiles and payoffs, which may be obtained from observations of play or access to a simulator. Using our clustering we construct a reduced "twins" game in which each cluster is associated with two players of the reduced game. This allows our representation to be individually-responsive because we align the interests of every individual agent with the strategy of its cluster. Our approach provides agents with higher payoffs and lower regret on average than model-free methods as well as previous cluster-based methods, and requires only few observations for learning to be successful. The "twins" approach is shown to be an important component of providing these low regret approximations.},
author = {Ficici, Sevan G. and Parkes, David C. and Pfeffer, Avi},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {188--195},
title = {Learning and solving many-player games through a cluster-based representation},
year = {2008}
}
@inproceedings{23,
abstract = {Parameter estimation in Markov random fields (MRFs) is a difficult task, in which inference over the network is run in the inner loop of a gradient descent procedure. Replacing exact inference with approximate methods such as loopy belief propagation (LBP) can suffer from poor convergence. In this paper, we provide a different approach for combining MRF learning and Bethe approximation. We consider the dual of maximum likelihood Markov network learning — maximizing entropy with moment matching constraints — and then approximate both the objective and the constraints in the resulting optimization problem. Unlike previous work along these lines (Teh \& Welling, 2003), our formulation allows parameter sharing between features in a general log-linear model, parameter regularization and conditional training. We show that piecewise training (Sutton \& McCallum, 2005) is a very restricted special case of this formulation. We study two optimization strategies: one based on a single convex approximation and one that uses repeated convex approximations. We show results on several real-world networks that demonstrate that these algorithms can significantly outperform learning with loopy and piece-wise. Our results also provide a framework for analyzing the trade-offs of different relaxations of the entropy objective and of the constraints.},
author = {Ganapathi, Varun and Vickrey, David and Duchi, John and Koller, Daphne},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {196--203},
title = {Constrained approximate maximum entropy learning of {Markov} random fields},
year = {2008}
}
@inproceedings{24,
abstract = {In many machine learning problems, labeled training data is limited but unlabeled data is ample. Some of these problems have instances that can be factored into multiple views, each of which is nearly sufficent in determining the correct labels. In this paper we present a new algorithm for probabilistic multi-view learning which uses the idea of stochastic agreement between views as regularization. Our algorithm works on structured and unstructured problems and easily generalizes to partial agreement scenarios. For the full agreement case, our algorithm minimizes the Bhattacharyya distance between the models of each view, and performs better than CoBoosting and two-view Perceptron on several flat and structured classification problems.},
author = {Ganchev, Kuzman and Gra{\c{c}}a, Jo{\~a}o V. and Blitzer, John and Taskar, Ben},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {204--211},
title = {Multi-view learning over structured and non-identical outputs},
year = {2008}
}
@inproceedings{25,
abstract = {The paper introduces AND/OR importance sampling for probabilistic graphical models. In contrast to importance sampling, AND/OR importance sampling caches samples in the AND/OR space and then extracts a new sample mean from the stored samples. We prove that AND/OR importance sampling may have lower variance than importance sampling; thereby providing a theoretical justification for preferring it over importance sampling. Our empirical evaluation demonstrates that AND/OR importance sampling is far more accurate than importance sampling in many cases.},
author = {Gogate, Vibhav and Dechter, Rina},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {212--219},
title = {{AND/OR} importance sampling},
year = {2008}
}
@inproceedings{26,
abstract = {Formal languages for probabilistic modeling enable re-use, modularity, and descriptive clarity, and can foster generic inference techniques. We introduce Church, a universal language for describing stochastic generative processes. Church is based on the Lisp model of lambda calculus, containing a pure Lisp as its deterministic subset. The semantics of Church is defined in terms of evaluation histories and conditional distributions on such histories. Church also includes a novel language construct, the stochastic memoizer, which enables simple description of many complex non-parametric models. We illustrate language features through several examples, including: a generalized Bayes net in which parameters cluster over trials, infinite PCFGs, planning by inference, and various non-parametric clustering models. Finally, we show how to implement query on any Church program, exactly and approximately, using Monte Carlo techniques.},
author = {Goodman, Noah D. and Mansinghka, Vikash K. and Roy, Daniel and Bonawitz, Keith and Tenenbaum, Joshua B.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {220--229},
title = {{Church}: a language for generative models},
year = {2008}
}
@inproceedings{27,
abstract = {Latent topic models have been successfully applied as an unsupervised topic discovery technique in large document collections. With the proliferation of hypertext document collection such as the Internet, there has also been great interest in extending these approaches to hypertext [6, 9]. These approaches typically model links in an analogous fashion to how they model words - the document-link co-occurrence matrix is modeled in the same way that the document-word co-occurrence matrix is modeled in standard topic models.In this paper we present a probabilistic generative model for hypertext document collections that explicitly models the generation of links. Specifically, links from a word w to a document d depend directly on how frequent the topic of w is in d, in addition to the in-degree of d. We show how to perform EM learning on this model efficiently. By not modeling links as analogous to words, we end up using far fewer free parameters and obtain better link prediction results.},
author = {Gruber, Amit and Rosen-Zvi, Michal and Weiss, Yair},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {230--239},
title = {Latent topic models for hypertext},
year = {2008}
}
@inproceedings{28,
abstract = {We consider how an agent should update her uncertainty when it is represented by a set P of probability distributions and the agent observes that a random variable X takes on value x, given that the agent makes decisions using the minimax criterion, perhaps the best-studied and most commonly-used criterion in the literature. We adopt a game-theoretic framework, where the agent plays against a bookie, who chooses some distribution from P. We consider two reasonable games that differ in what the bookie knows when he makes his choice. Anomalies that have been observed before, like time inconsistency, can be understood as arising because different games are being played, against bookies with different information. We characterize the important special cases in which the optimal decision rules according to the minimax criterion amount to either conditioning or simply ignoring the information. Finally, we consider the relationship between conditioning and calibration when uncertainty is described by sets of probabilities.},
author = {Gr{\"u}nwald, Peter D. and Halpern, Joseph Y.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {240--247},
title = {A game-theoretic analysis of updating sets of probabilities},
year = {2008}
}
@inproceedings{29,
abstract = {Approximate inference in dynamic systems is the problem of estimating the state of the system given a sequence of actions and partial observations. High precision estimation is fundamental in many applications like diagnosis, natural language processing, tracking, planning, and robotics. In this paper we present an algorithm that samples possible deterministic executions of a probabilistic sequence. The algorithm takes advantage of a compact representation (using first order logic) for actions and world states to improve the precision of its estimation. Theoretical and empirical results show that the algorithm's expected error is smaller than propositional sampling and Sequential Monte Carlo (SMC) sampling techniques.},
author = {Hajishirzi, Hannaneh and Amir, Eyal},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {248--255},
title = {Sampling first order logical particles},
year = {2008}
}
@inproceedings{3,
abstract = {We introduce Clique Matrices as an alternative representation of undirected graphs, being a generalisation of the incidence matrix representation. Here we use clique matrices to decompose a graph into a set of possibly overlapping clusters, defined as well-connected subsets of vertices. The decomposition is based on a statistical description which encourages clusters to be well connected and few in number. Inference is carried out using a variational approximation. Clique matrices also play a natural role in parameterising positive definite matrices under zero constraints on elements of the matrix. We show that clique matrices can parameterise all positive definite matrices restricted according to a decomposable graph and form a structured Factor Analysis approximation in the non-decomposable case.},
author = {Barber, David},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {26--33},
title = {Clique matrices for statistical graph decomposition and parameterising restricted positive definite matrices},
year = {2008}
}
@inproceedings{30,
abstract = {Bounded policy iteration is an approach to solving infinite-horizon POMDPs that represents policies as stochastic finite-state controllers and iteratively improves a controller by adjusting the parameters of each node using linear programming. In the original algorithm, the size of the linear programs, and thus the complexity of policy improvement, depends on the number of parameters of each node, which grows with the size of the controller. But in practice, the number of parameters of a node with non-zero values is often very small, and does not grow with the size of the controller. Based on this observation, we develop a version of bounded policy iteration that leverages the sparse structure of a stochastic finite-state controller. In each iteration, it improves a policy by the same amount as the original algorithm, but with much better scalability.},
author = {Hansen, Eric A.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {256--263},
title = {Sparse stochastic finite-state controllers for {POMDPs}},
year = {2008}
}
@inproceedings{31,
abstract = {Inference problems in graphical models can be represented as a constrained optimization of a free energy function. It is known that when the Bethe free energy is used, the fixed-points of the belief propagation (BP) algorithm correspond to the local minima of the free energy. However BP fails to converge in many cases of interest. Moreover, the Bethe free energy is non-convex for graphical models with cycles thus introducing great difficulty in deriving efficient algorithms for finding local minima of the free energy for general graphs. In this paper we introduce two efficient BP-like algorithms, one sequential and the other parallel, that are guaranteed to converge to the global minimum, for any graph, over the class of energies known as "convex free energies". In addition, we propose an efficient heuristic for setting the parameters of the convex free energy based on the structure of the graph.},
author = {Hazan, Tamir and Shashua, Amnon},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {264--273},
title = {Convergent message-passing algorithms for inference over general graphs with convex free energies},
year = {2008}
}
@inproceedings{32,
abstract = {We study a multiagent learning problem where agents can either learn via repeated interactions, or can follow the advice of a mediator who suggests possible actions to take. We present an algorithm that each agent can use so that, with high probability, they can verify whether or not the mediator's advice is useful. In particular, if the mediator's advice is useful then agents will reach a correlated equilibrium, but if the mediator's advice is not useful, then agents are not harmed by using our test, and can fall back to their original learning algorithm. We then generalize our algorithm and show that in the limit it always correctly verifies the mediator's advice.},
author = {Hines, Greg and Larson, Kate},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {274--281},
title = {Learning when to take advice: a statistical test for achieving a correlated equilibrium},
year = {2008}
}
@inproceedings{33,
abstract = {An important task in data analysis is the discovery of causal relationships between observed variables. For continuous-valued data, linear acyclic causal models are commonly used to model the data-generating process, and the inference of such models is a well-studied problem. However, existing methods have significant limitations. Methods based on conditional independencies (Spirtes et al. 1993; Pearl 2000) cannot distinguish between independence-equivalent models, whereas approaches purely based on Independent Component Analysis (Shimizu et al. 2006) are inapplicable to data which is partially Gaussian. In this paper, we generalize and combine the two approaches, to yield a method able to learn the model structure in many cases for which the previous methods provide answers that are either incorrect or are not as informative as possible. We give exact graphical conditions for when two distinct models represent the same family of distributions, and empirically demonstrate the power of our method through thorough simulations.},
author = {Hoyer, Patrik O. and Hyv{\"a}rinen, Aapo and Scheines, Richard and Spirtes, Peter and Ramsey, Joseph and Lacerda, Gustavo and Shimizu, Shohei},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {282--289},
title = {Causal discovery of linear acyclic models with arbitrary distributions},
year = {2008}
}
@inproceedings{34,
abstract = {We introduce a new type of graphical model called a 'cumulative distribution network' (CDN), which expresses a joint cumulative distribution as a product of local functions. Each local function can be viewed as providing evidence about possible orderings, or rankings, of variables. Interestingly, we find that the conditional independence properties of CDNs are quite different from other graphical models. We also describe a message-passing algorithm that efficiently computes conditional cumulative distributions. Due to the unique independence properties of the CDN, these messages do not in general have a one-to-one correspondence with messages exchanged in standard algorithms, such as belief propagation. We demonstrate the application of CDNs for structured ranking learning using a previously-studied multi-player gaming dataset.},
author = {Huang, Jim C. and Frey, Brendan J.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {290--297},
title = {Cumulative distribution networks and the derivative-sum-product algorithm},
year = {2008}
}
@inproceedings{35,
abstract = {User preferences for automated assistance often vary widely, depending on the situation, and quality or presentation of help. Developing effective models to learn individual preferences online requires domain models that associate observations of user behavior with their utility functions, which in turn can be constructed using utility elicitation techniques. However, most elicitation methods ask for users' predicted utilities based on hypothetical scenarios rather than more realistic experienced utilities. This is especially true in interface customization, where users are asked to assess novel interface designs. We propose experiential utility elicitation methods for customization and compare these to predictive methods. As experienced utilities have been argued to better reflect true preferences in behavioral decision making, the purpose here is to investigate accurate and efficient procedures that are suitable for software domains. Unlike conventional elicitation, our results indicate that an experiential approach helps people understand stochastic outcomes, as well as better appreciate the sequential utility of intelligent assistance.},
author = {Hui, Bowen and Boutilier, Craig},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {298--305},
title = {Toward experiential utility elicitation for interface customization},
year = {2008}
}
@inproceedings{36,
abstract = {In this paper, we consider planning in stochastic shortest path (SSP) problems, a subclass of Markov Decision Problems (MDP). We focus on medium-size problems whose state space can be fully enumerated. This problem has numerous important applications, such as navigation and planning under uncertainty. We propose a new approach for constructing a multi-level hierarchy of progressively simpler abstractions of the original problem. Once computed, the hierarchy can be used to speed up planning by first finding a policy for the most abstract level and then recursively refining it into a solution to the original problem. This approach is fully automated and delivers a speed-up of two orders of magnitude over a state-of-the-art MDP solver on sample problems while returning near-optimal solutions. We also prove theoretical bounds on the loss of solution optimality resulting from the use of abstractions.},
author = {Isaza, Alejandro and Szepesv{\'a}ri, Csaba and Bulitko, Vadim and Greiner, Russell},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {306--314},
title = {Speeding up planning in {Markov} decision processes via automatically constructed abstractions},
year = {2008}
}
@inproceedings{37,
abstract = {A Bayesian treatment of latent directed graph structure for non-iid data is provided where each child datum is sampled with a directed conditional dependence on a single unknown parent datum. The latent graph structure is assumed to lie in the family of directed out-tree graphs which leads to efficient Bayesian inference. The latent likelihood of the data and its gradients are computable in closed form via Tutte's directed matrix tree theorem using determinants and inverses of the out-Laplacian. This novel likelihood subsumes iid likelihood, is exchangeable and yields efficient unsupervised and semi-supervised learning algorithms. In addition to handling taxonomy and phylogenetic datasets the out-tree assumption performs surprisingly well as a semi-parametric density estimator on standard iid datasets. Experiments with unsupervised and semi-supervised learning are shown on various UCI and taxonomy datasets.},
author = {Jebara, Tony},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {315--324},
title = {Bayesian out-trees},
year = {2008}
}
@inproceedings{38,
abstract = {Identifying co-varying causal elements in very high dimensional feature space with internal structures, e.g., a space with as many as millions of linearly ordered features, as one typically encounters in problems such as whole genome association (WGA) mapping, remains an open problem in statistical learning. We propose a block-regularized regression model for sparse variable selection in a high-dimensional space where the covariates are linearly ordered, and are possibly subject to local statistical linkages (e.g., block structures) due to spacial or temporal proximity of the features. Our goal is to identify a small subset of relevant covariates that are not merely from random positions in the ordering, but grouped as contiguous blocks from large number of ordered covariates. Following a typical linear regression framework between the features and the response, our proposed model employs a sparsity-enforcing Laplacian prior for the regression coefficients, augmented by a 1st-order Markovian process along the feature sequence that "activates" the regression coefficients in a coupled fashion. We describe a sampling-based learning algorithm and demonstrate the performance of our method on simulated and biological data for marker identification under WGA.},
author = {Kim, Seyoung and Xing, Eric},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {325--332},
title = {Feature selection via block-regularized regression},
year = {2008}
}
@inproceedings{39,
abstract = {This paper deals with the problem of evaluating the causal effect using observational data in the presence of an unobserved exposure/outcome variable, when cause-effect relationships between variables can be described as a directed acyclic graph and the corresponding recursive factorization of a joint distribution. First, we propose identifiability criteria for causal effects when an unobserved exposure/outcome variable is considered to contain more than two categories. Next, when unmeasured variables exist between an unobserved outcome variable and its proxy variables, we provide the tightest bounds based on the potential outcome approach. The results of this paper are helpful to evaluate causal effects in the case where it is difficult or expensive to observe an exposure/outcome variable in many practical fields.},
author = {Kuroki, Manabu and Cai, Zhihong},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {333--340},
title = {The evaluation of causal effects in studies with an unobserved exposure/outcome variable: bounds and identification},
year = {2008}
}
@inproceedings{4,
abstract = {Decision circuits have been developed to perform efficient evaluation of influence diagrams [Bhattacharjya and Shachter, 2007], building on the advances in arithmetic circuits for belief network inference [Darwiche, 2003]. In the process of model building and analysis, we perform sensitivity analysis to understand how the optimal solution changes in response to changes in the model. When sequential decision problems under uncertainty are represented as decision circuits, we can exploit the efficient solution process embodied in the decision circuit and the wealth of derivative information available to compute the value of information for the uncertainties in the problem and the effects of changes to model parameters on the value and the optimal strategy.},
author = {Bhattacharjya, Debarun and Shachter, Ross D.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {34--42},
title = {Sensitivity analysis in decision circuits},
year = {2008}
}
@inproceedings{40,
abstract = {Approximate linear programming (ALP) is an efficient approach to solving large factored Markov decision processes (MDPs). The main idea of the method is to approximate the optimal value function by a set of basis functions and optimize their weights by linear programming (LP). This paper proposes a new ALP approximation. Comparing to the standard ALP formulation, we decompose the constraint space into a set of low-dimensional spaces. This structure allows for solving the new LP efficiently. In particular, the constraints of the LP can be satisfied in a compact form without an exponential dependence on the treewidth of ALP constraints. We study both practical and theoretical aspects of the proposed approach. Moreover, we demonstrate its scale-up potential on an MDP with more than $2^{100}$ states.},
author = {Kveton, Branislav and Hauskrecht, Milos},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {341--348},
title = {Partitioned linear programming approximations for {MDPs}},
year = {2008}
}
@inproceedings{41,
abstract = {While known algorithms for sensitivity analysis and parameter tuning in probabilistic networks have a running time that is exponential in the size of the network, the exact computational complexity of these problems has not been established as yet. In this paper we study several variants of the tuning problem and show that these problems are $\mathrm{NP}^{\mathrm{PP}}$-complete in general. We further show that the problems remain NP-complete or PP-complete, for a number of restricted variants. These complexity results provide insight in whether or not recent achievements in sensitivity analysis and tuning can be extended to more general, practicable methods.},
author = {Kwisthout, Johan and van der Gaag, Linda C.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {349--356},
title = {The computational complexity of sensitivity analysis and parameter tuning},
year = {2008}
}
@inproceedings{42,
abstract = {Confidence measures for the generalization error are crucial when small training samples are used to construct classifiers. A common approach is to estimate the generalization error by resampling and then assume the re-sampled estimator follows a known distribution to form a confidence set [Kohavi 1995, Martin 1996, Yang 2006]. Alternatively, one might bootstrap the resampled estimator of the generalization error to form a confidence set. Unfortunately, these methods do not reliably provide sets of the desired confidence. The poor performance appears to be due to the lack of smoothness of the generalization error as a function of the learned classifier. This results in a non-normal distribution of the estimated generalization error. We construct a confidence set for the generalization error by use of a smooth upper bound on the deviation between the resampled estimate and generalization error. The confidence set is formed by bootstrapping this upper bound. In cases in which the approximation class for the classifier can be represented as a parametric additive model, we provide a computationally efficient algorithm. This method exhibits superior performance across a series of test and simulated data sets.},
author = {Laber, Eric B. and Murphy, Susan A.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {357--365},
title = {Small sample inference for generalization error in classification using the {CUD} bound},
year = {2008}
}
@inproceedings{43,
abstract = {We generalize Shimizu et al's (2006) ICA-based approach for discovering linear non-Gaussian acyclic (LiNGAM) Structural Equation Models (SEMs) from causally sufficient, continuous-valued observational data. By relaxing the assumption that the generating SEM's graph is acyclic, we solve the more general problem of linear non-Gaussian (LiNG) SEM discovery. LiNG discovery algorithms output the distribution equivalence class of SEMs which, in the large sample limit, represents the population distribution. We apply a LiNG discovery algorithm to simulated data. Finally, we give sufficient conditions under which only one of the SEMs in the output class is "stable".},
author = {Lacerda, Gustavo and Spirtes, Peter and Ramsey, Joseph and Hoyer, Patrik O.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {366--374},
title = {Discovering cyclic causal models by independent components analysis},
year = {2008}
}
@inproceedings{44,
abstract = {An efficient policy search algorithm should estimate the local gradient of the objective function, with respect to the policy parameters, from as few trials as possible. Whereas most policy search methods estimate this gradient by observing the rewards obtained during policy trials, we show, both theoretically and empirically, that taking into account the sensor data as well gives better gradient estimates and hence faster learning. The reason is that rewards obtained during policy execution vary from trial to trial due to noise in the environment; sensor data, which correlates with the noise, can be used to partially correct for this variation, resulting in an estimator with lower variance.},
author = {Lawrence, Gregory and Russell, Stuart},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {375--382},
title = {Improving gradient estimation by incorporating sensor data},
year = {2008}
}
@inproceedings{45,
abstract = {Graphical models are usually learned without regard to the cost of doing inference with them. As a result, even if a good model is learned, it may perform poorly at prediction, because it requires approximate inference. We propose an alternative: learning models with a score function that directly penalizes the cost of inference. Specifically, we learn arithmetic circuits with a penalty on the number of edges in the circuit (in which the cost of inference is linear). Our algorithm is equivalent to learning a Bayesian network with context-specific independence by greedily splitting conditional distributions, at each step scoring the candidates by compiling the resulting network into an arithmetic circuit, and using its size as the penalty. We show how this can be done efficiently, without compiling a circuit from scratch for each candidate. Experiments on several real-world domains show that our algorithm is able to learn tractable models with very large treewidth, and yields more accurate predictions than a standard context-specific Bayesian network learner, in far less time.},
author = {Lowd, Daniel and Domingos, Pedro},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {383--392},
title = {Learning arithmetic circuits},
year = {2008}
}
@inproceedings{46,
abstract = {This paper presents a natural extension of stagewise ranking to the case of infinitely many items. We introduce the infinite generalized Mallows model (IGM), describe its properties and give procedures to estimate it from data. For estimation of multimodal distributions we introduce the Exponential-Blurring-Mean-Shift nonparametric clustering algorithm. The experiments highlight the properties of the new model and demonstrate that infinite models can be simple, elegant and practical.},
author = {Meil{\u{a}}, Marina and Bao, Le},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {393--402},
title = {Estimation and clustering with infinite rankings},
year = {2008}
}
@inproceedings{47,
abstract = {Nonparametric Bayesian models are often based on the assumption that the objects being modeled are exchangeable. While appropriate in some applications (e.g., bag-of-words models for documents), exchangeability is sometimes assumed simply for computational reasons; non-exchangeable models might be a better choice for applications based on subject matter. Drawing on ideas from graphical models and phylogenetics, we describe a non-exchangeable prior for a class of nonparametric latent feature models that is nearly as efficient computationally as its exchangeable counterpart. Our model is applicable to the general setting in which the dependencies between objects can be expressed using a tree, where edge lengths indicate the strength of relationships. We demonstrate an application to modeling probabilistic choice.},
author = {Miller, Kurt T. and Griffiths, Thomas L. and Jordan, Michael I.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {403--410},
title = {The phylogenetic {Indian Buffet} process: a non-exchangeable nonparametric prior for latent features},
year = {2008}
}
@inproceedings{48,
abstract = {Although fully generative models have been successfully used to model the contents of text documents, they are often awkward to apply to combinations of text data and document metadata. In this paper we propose a Dirichlet-multinomial regression (DMR) topic model that includes a log-linear prior on document-topic distributions that is a function of observed features of the document, such as author, publication venue, references, and dates. We show that by selecting appropriate features, DMR topic models can meet or exceed the performance of several previously published topic models designed for specific data.},
author = {Mimno, David and McCallum, Andrew},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {411--418},
title = {Topic models conditioned on arbitrary features with {Dirichlet}-multinomial regression},
year = {2008}
}
@inproceedings{49,
abstract = {We present a polynomial-time algorithm that always finds an (approximate) Nash equilibrium for repeated two-player stochastic games. The algorithm exploits the folk theorem to derive a strategy profile that forms an equilibrium by buttressing mutually beneficial behavior with threats, where possible. One component of our algorithm efficiently searches for an approximation of the egalitarian point, the fairest Pareto-efficient solution. The paper concludes by applying the algorithm to a set of grid games to illustrate typical solutions the algorithm finds. These solutions compare very favorably to those found by competing algorithms, resulting in strategies with higher social welfare, as well as guaranteed computational efficiency.},
author = {de Cote, Enrique Munoz and Littman, Michael L.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {419--426},
title = {A polynomial-time {Nash} equilibrium algorithm for repeated stochastic games},
year = {2008}
}
@inproceedings{5,
abstract = {We propose a variable decomposition algorithm-greedy block coordinate descent (GBCD)-in order to make dense Gaussian process regression practical for large scale problems. GBCD breaks a large scale optimization into a series of small sub-problems. The challenge in variable decomposition algorithms is the identification of a sub-problem (the active set of variables) that yields the largest improvement. We analyze the limitations of existing methods and cast the active set selection into a zero-norm constrained optimization problem that we solve using greedy methods. By directly estimating the decrease in the objective function, we obtain not only efficient approximate solutions for GBCD, but we are also able to demonstrate that the method is globally convergent. Empirical comparisons against competing dense methods like Conjugate Gradient or SMO show that GBCD is an order of magnitude faster. Comparisons against sparse GP methods show that GBCD is both accurate and capable of handling datasets of 100,000 samples or more.},
author = {Bo, Liefeng and Sminchisescu, Cristian},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {43--52},
title = {Greedy block coordinate descent for large scale {Gaussian} process regression},
year = {2008}
}
@inproceedings{50,
abstract = {Bayesian networks can be used to extract explanations about the observed state of a subset of variables. In this paper, we explicate the desiderata of an explanation and confront them with the concept of explanation proposed by existing methods. The necessity of taking into account causal approaches when a causal graph is available is discussed. We then introduce causal explanation trees, based on the construction of explanation trees using the measure of causal information flow (Ay and Polani, 2006). This approach is compared to several other methods on known networks.},
author = {Nielsen, Ulf H. and Pellet, Jean-Philippe and Elisseeff, Andr{\'e}},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {427--434},
title = {Explanation trees for causal {Bayesian} networks},
year = {2008}
}
@inproceedings{51,
abstract = {A lattice-theoretic framework is introduced that permits the study of the conditional independence (CI) implication problem relative to the class of discrete probability measures. Semi-lattices are associated with CI statements and a finite, sound and complete inference system relative to semi-lattice inclusions is presented. This system is shown to be (1) sound and complete for saturated CI statements, (2) complete for general CI statements, and (3) sound and complete for stable CI statements. These results yield a criterion that can be used to falsify instances of the implication problem and several heuristics are derived that approximate this "lattice-exclusion" criterion in polynomial time. Finally, we provide experimental results that relate our work to results obtained from other existing inference algorithms.},
author = {Niepert, Mathias and Van Gucht, Dirk and Gyssens, Marc},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {435--443},
title = {On the conditional independence implication problem: a lattice-theoretic approach},
year = {2008}
}
@inproceedings{52,
abstract = {We consider the task of learning mappings from sequential data to real-valued responses. We present and evaluate an approach to learning a type of hidden Markov model (HMM) for regression. The learning process involves inferring the structure and parameters of a conventional HMM, while simultaneously learning a regression model that maps features that characterize paths through the model to continuous responses. Our results, in both synthetic and biological domains, demonstrate the value of jointly learning the two components of our approach.},
author = {Noto, Keith and Craven, Mark},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {444--451},
title = {Learning hidden {Markov} models for regression using path aggregation},
year = {2008}
}
@inproceedings{53,
abstract = {This paper develops a measure for bounding the performance of AND/OR search algorithms for solving a variety of queries over graphical models. We show how drawing a connection to the recent notion of hypertree decompositions allows to exploit determinism in the problem specification and produce tighter bounds. We demonstrate on a variety of practical problem instances that we are often able to improve upon existing bounds by several orders of magnitude.},
author = {Otten, Lars and Dechter, Rina},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {452--459},
title = {Bounding search space size via {(Hyper)tree} decompositions},
year = {2008}
}
@inproceedings{54,
abstract = {Deciding what to sense is a crucial task, made harder by dependencies and by a non-additive utility function. We develop approximation algorithms for selecting an optimal set of measurements, under a dependency structure modeled by a tree-shaped Bayesian network (BN). Our approach is a generalization of composing anytime algorithm represented by conditional performance profiles. This is done by relaxing the input monotonicity assumption, and extending the local compilation technique to more general classes of performance profiles (PPs). We apply the extended scheme to selecting a subset of measurements for choosing a maximum expectation variable in a binary valued BN, and for minimizing the worst variance in a Gaussian BN.},
author = {Radovilsky, Yan and Shimony, Solomon Eyal},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {460--467},
title = {Observation subset selection as local compilation of performance profiles},
year = {2008}
}
@inproceedings{55,
abstract = {In this work we present Cutting Plane Inference (CPI), a Maximum A Posteriori (MAP) inference method for Statistical Relational Learning. Framed in terms of Markov Logic and inspired by the Cutting Plane Method, it can be seen as a meta algorithm that instantiates small parts of a large and complex Markov Network and then solves these using a conventional MAP method. We evaluate CPI on two tasks, Semantic Role Labelling and Joint Entity Resolution, while plugging in two different MAP inference methods: the current method of choice for MAP inference in Markov Logic, MaxWalkSAT, and Integer Linear Programming. We observe that when used with CPI both methods are significantly faster than when used alone. In addition, CPI improves the accuracy of MaxWalkSAT and maintains the exactness of Integer Linear Programming.},
author = {Riedel, Sebastian},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {468--475},
title = {Improving the accuracy and efficiency of {MAP} inference for {Markov Logic}},
year = {2008}
}
@inproceedings{56,
abstract = {Model-based Bayesian reinforcement learning has generated significant interest in the AI community as it provides an elegant solution to the optimal exploration-exploitation tradeoff in classical reinforcement learning. Unfortunately, the applicability of this type of approach has been limited to small domains due to the high complexity of reasoning about the joint posterior over model parameters. In this paper, we consider the use of factored representations combined with online planning techniques, to improve scalability of these methods. The main contribution of this paper is a Bayesian framework for learning the structure and parameters of a dynamical system, while also simultaneously planning a (near-)optimal sequence of actions.},
author = {Ross, St{\'e}phane and Pineau, Joelle},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {476--483},
title = {Model-based {Bayesian} reinforcement learning in large structured domains},
year = {2008}
}
@inproceedings{57,
abstract = {We present a generative model for representing and reasoning about the relationships among events in continuous time. We apply the model to the domain of networked and distributed computing environments where we fit the parameters of the model from timestamp observations, and then use hypothesis testing to discover dependencies between the events and changes in behavior for monitoring and diagnosis. After introducing the model, we present an EM algorithm for fitting the parameters and then present the hypothesis testing approach for both dependence discovery and change-point detection. We validate the approach for both tasks using real data from a trace of network events at Microsoft Research Cambridge. Finally, we formalize the relationship between the proposed model and the noisy-or gate for cases when time can be discretized.},
author = {Simma, Aleksandr and Goldszmidt, Moises and MacCormick, John and Barham, Paul and Black, Richard and Isaacs, Rebecca and Mortier, Richard},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {484--493},
title = {{CT-NOR}: representing and reasoning about events in continuous time},
year = {2008}
}
@inproceedings{58,
abstract = {Numerous temporal inference tasks such as fault monitoring and anomaly detection exhibit a persistence property: for example, if something breaks, it stays broken until an intervention. When modeled as a Dynamic Bayesian Network, persistence adds dependencies between adjacent time slices, often making exact inference over time intractable using standard inference algorithms. However, we show that persistence implies a regular structure that can be exploited for efficient inference. We present three successively more general classes of models: persistent causal chains (PCCs), persistent causal trees (PCTs) and persistent polytrees (PPTs), and the corresponding exact inference algorithms that exploit persistence. We show that analytic asymptotic bounds for our algorithms compare favorably to junction tree inference; and we demonstrate empirically that we can perform exact smoothing on the order of 100 times faster than the approximate Boyen-Koller method on randomly generated instances of persistent tree models. We also show how to handle non-persistent variables and how persistence can be exploited effectively for approximate filtering.},
author = {\v{S}ingliar, Tom\'{a}\v{s} and Dash, Denver H.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {494--502},
title = {Efficient inference in persistent {Dynamic} {Bayesian} {Networks}},
year = {2008}
}
@inproceedings{59,
abstract = {Linear Programming (LP) relaxations have become powerful tools for finding the most probable (MAP) configuration in graphical models. These relaxations can be solved efficiently using message-passing algorithms such as belief propagation and, when the relaxation is tight, provably find the MAP configuration. The standard LP relaxation is not tight enough in many real-world problems, however, and this has lead to the use of higher order cluster-based LP relaxations. The computational cost increases exponentially with the size of the clusters and limits the number and type of clusters we can use. We propose to solve the cluster selection problem monotonically in the dual LP, iteratively selecting clusters with guaranteed improvement, and quickly re-solving with the added clusters by reusing the existing solution. Our dual message-passing algorithm finds the MAP configuration in protein side-chain placement, protein design, and stereo problems, in cases where the standard LP relaxation fails.},
author = {Sontag, David and Meltzer, Talya and Globerson, Amir and Jaakkola, Tommi and Weiss, Yair},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {503--510},
title = {Tightening {LP} relaxations for {MAP} using message passing},
year = {2008}
}
@inproceedings{6,
abstract = {Continuous state spaces and stochastic, switching dynamics characterize a number of rich, real-world domains, such as robot navigation across varying terrain. We describe a reinforcement-learning algorithm for learning in these domains and prove for certain environments the algorithm is probably approximately correct with a sample complexity that scales polynomially with the state-space dimension. Unfortunately, no optimal planning techniques exist in general for such problems; instead we use fitted value iteration to solve the learned MDP, and include the error due to approximate planning in our bounds. Finally, we report an experiment using a robotic car driving over varying terrain to demonstrate that these dynamics representations adequately capture real-world dynamics and that our algorithm can be used to efficiently solve such problems.},
author = {Brunskill, Emma and Leffler, Bethany R. and Li, Lihong and Littman, Michael L. and Roy, Nicholas},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {53--61},
title = {{CORL}: a continuous-state offset-dynamics reinforcement learner},
year = {2008}
}
@inproceedings{60,
abstract = {In the Bayesian approach to structure learning of graphical models, the equivalent sample size (ESS) in the Dirichlet prior over the model parameters was recently shown to have an important effect on the maximum-a-posteriori estimate of the Bayesian network structure. In our first contribution, we theoretically analyze the case of large ESS-values, which complements previous work: among other results, we find that the presence of an edge in a Bayesian network is favoured over its absence even if both the Dirichlet prior and the data imply independence, as long as the conditional empirical distribution is notably different from uniform. In our second contribution, we focus on realistic ESS-values, and provide an analytical approximation to the 'optimal' ESS-value in a predictive sense (its accuracy is also validated experimentally): this approximation provides an understanding as to which properties of the data have the main effect determining the 'optimal' ESS-value.},
author = {Steck, Harald},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {511--518},
title = {Learning the {Bayesian} network structure: {Dirichlet} prior versus data},
year = {2008}
}
@inproceedings{61,
abstract = {We present and evaluate new techniques for designing algorithm portfolios. In our view, the problem has both a scheduling aspect and a machine learning aspect. Prior work has largely addressed one of the two aspects in isolation. Building on recent work on the scheduling aspect of the problem, we present a technique that addresses both aspects simultaneously and has attractive theoretical guarantees. Experimentally, we show that this technique can be used to improve the performance of state-of-the-art algorithms for Boolean satisfiability, zero-one integer programming, and A.I. planning.},
author = {Streeter, Matthew and Smith, Stephen F.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {519--527},
title = {New techniques for algorithm portfolio design},
year = {2008}
}
@inproceedings{62,
abstract = {We consider the problem of efficiently learning optimal control policies and value functions over large state spaces in an online setting in which estimates must be available after each interaction with the world. This paper develops an explicitly model-based approach extending the Dyna architecture to linear function approximation. Dyna-style planning proceeds by generating imaginary experience from the world model and then applying model-free reinforcement learning algorithms to the imagined state transitions. Our main results are to prove that linear Dyna-style planning converges to a unique solution independent of the generating distribution, under natural conditions. In the policy evaluation setting, we prove that the limit point is the least-squares (LSTD) solution. An implication of our results is that prioritized-sweeping can be soundly extended to the linear approximation case, backing up to preceding features rather than to preceding states. We introduce two versions of prioritized sweeping with linear Dyna and briefly illustrate their performance empirically on the Mountain Car and Boyan Chain problems.},
author = {Sutton, Richard S. and Szepesv\'{a}ri, Csaba and Geramifard, Alborz and Bowling, Michael},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {528--536},
title = {Dyna-style planning with linear function approximation and prioritized sweeping},
year = {2008}
}
@inproceedings{63,
abstract = {Exemplar-based clustering methods have been shown to produce state-of-the-art results on a number of synthetic and real-world clustering problems. They are appealing because they offer computational benefits over latent-mean models and can handle arbitrary pairwise similarity measures between data points. However, when trying to recover underlying structure in clustering problems, tailored similarity measures are often not enough; we also desire control over the distribution of cluster sizes. Priors such as Dirichlet process priors allow the number of clusters to be unspecified while expressing priors over data partitions. To our knowledge, they have not been applied to exemplar-based models. We show how to incorporate priors, including Dirichlet process priors, into the recently introduced affinity propagation algorithm. We develop an efficient max-product belief propagation algorithm for our new model and demonstrate experimentally how the expanded range of clustering priors allows us to better recover true clusterings in situations where we have some information about the generating process.},
author = {Tarlow, Daniel and Zemel, Richard S. and Frey, Brendan J.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {537--545},
title = {Flexible priors for exemplar-based clustering},
year = {2008}
}
@inproceedings{64,
abstract = {A Chain Event Graph (CEG) is a graphical model which is designed to embody conditional independencies in problems whose state spaces are highly asymmetric and do not admit a natural product structure. In this paper we present a probability propagation algorithm which uses the topology of the CEG to build a transporter CEG. Intriguingly, the transporter CEG is directly analogous to the triangulated Bayesian Network (BN) in the more conventional junction tree propagation algorithms used with BNs. The propagation method uses factorization formulae also analogous to (but different from) the ones using potentials on cliques and separators of the BN. It appears that the methods will be typically more efficient than the BN algorithms when applied to contexts where there is significant asymmetry present.},
author = {Thwaites, Peter A. and Smith, Jim Q. and Cowell, Robert G.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {546--553},
title = {Propagation using {Chain} {Event} {Graphs}},
year = {2008}
}
@inproceedings{65,
abstract = {We address the problem of identifying dynamic sequential plans in the framework of causal Bayesian networks, and show that the problem is reduced to identifying causal effects, for which there are complete identification algorithms available in the literature.},
author = {Tian, Jin},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {554--561},
title = {Identifying dynamic sequential plans},
year = {2008}
}
@inproceedings{66,
abstract = {Planning can often be simplified by decomposing the task into smaller tasks arranged hierarchically. Charlin et al. [4] recently showed that the hierarchy discovery problem can be framed as a non-convex optimization problem. However, the inherent computational difficulty of solving such an optimization problem makes it hard to scale to real-world problems. In another line of research, Toussaint et al. [18] developed a method to solve planning problems by maximum-likelihood estimation. In this paper, we show how the hierarchy discovery problem in partially observable domains can be tackled using a similar maximum likelihood approach. Our technique first transforms the problem into a dynamic Bayesian network through which a hierarchical structure can naturally be discovered while optimizing the policy. Experimental results demonstrate that this approach scales better than previous techniques based on non-convex optimization.},
author = {Toussaint, Marc and Charlin, Laurent and Poupart, Pascal},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {562--570},
title = {Hierarchical {POMDP} controller optimization by likelihood maximization},
year = {2008}
}
@inproceedings{67,
abstract = {Much recent work has concerned sparse approximations to speed up the Gaussian process regression from the unfavorable O(n3) scaling in computational time to O(nm2). Thus far, work has concentrated on models with one covariance function. However, in many practical situations additive models with multiple covariance functions may perform better, since the data may contain both long and short length-scale phenomena. The long length-scales can be captured with global sparse approximations, such as fully independent conditional (FIC), and the short length-scales can be modeled naturally by covariance functions with compact support (CS). CS covariance functions lead to naturally sparse covariance matrices, which are computationally cheaper to handle than full covariance matrices. In this paper, we propose a new sparse Gaussian process model with two additive components: FIC for the long length-scales and CS covariance function for the short length-scales. We give theoretical and experimental results and show that under certain conditions the proposed model has the same computational complexity as FIC. We also compare the model performance of the proposed model to additive models approximated by fully and partially independent conditional (PIC). We use real data sets and show that our model outperforms FIC and PIC approximations for data sets with two additive phenomena.},
author = {Vanhatalo, Jarno and Vehtari, Aki},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {571--578},
title = {Modelling local and global phenomena with sparse {Gaussian} processes},
year = {2008}
}
@inproceedings{68,
abstract = {In this paper, we develop the continuous time dynamic topic model (cDTM). The cDTM is a dynamic topic model that uses Brownian motion to model the latent topics through a sequential collection of documents, where a "topic" is a pattern of word use that we expect to evolve over the course of the collection. We derive an efficient variational approximate inference algorithm that takes advantage of the sparsity of observations in text, a property that lets us easily handle many time points. In contrast to the cDTM, the original discrete-time dynamic topic model (dDTM) requires that time be discretized. Moreover, the complexity of variational inference for the dDTM grows quickly as time granularity increases, a drawback which limits fine-grained discretization. We demonstrate the cDTM on two news corpora, reporting both predictive perplexity and the novel task of time stamp prediction.},
author = {Wang, Chong and Blei, David and Heckerman, David},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {579--586},
title = {Continuous time dynamic topic models},
year = {2008}
}
@inproceedings{69,
abstract = {Variational Bayesian inference and (collapsed) Gibbs sampling are the two important classes of inference algorithms for Bayesian networks. Both have their advantages and disadvantages: collapsed Gibbs sampling is unbiased but is also inefficient for large count values and requires averaging over many samples to reduce variance. On the other hand, variational Bayesian inference is efficient and accurate for large count values but suffers from bias for small counts. We propose a hybrid algorithm that combines the best of both worlds: it samples very small counts and applies variational updates to large counts. This hybridization is shown to significantly improve test-set perplexity relative to variational inference at no computational cost.},
author = {Welling, Max and Teh, Yee Whye and Kappen, Bert},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {587--594},
title = {Hybrid variational/{Gibbs} collapsed inference in topic models},
year = {2008}
}
@inproceedings{7,
abstract = {Assume that cause-effect relationships between variables can be described as a directed acyclic graph and the corresponding linear structural equation model. We consider the identification problem of total effects in the presence of latent variables and selection bias between a treatment variable and a response variable. Pearl and his colleagues provided the back door criterion, the front door criterion (Pearl, 2000) and the conditional instrumental variable method (Brito and Pearl, 2002) as identifiability criteria for total effects in the presence of latent variables, but not in the presence of selection bias. In order to solve this problem, we propose new graphical identifiability criteria for total effects based on the identifiable factor models. The results of this paper are useful to identify total effects in observational studies and provide a new viewpoint to the identification conditions of factor models.},
author = {Cai, Zhihong and Kuroki, Manabu},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {62--69},
title = {On identifying total effects in the presence of latent variables and selection bias},
year = {2008}
}
@inproceedings{70,
abstract = {The paper introduces a generalization for known probabilistic models such as log-linear and graphical models, called here multiplicative models. These models, that express probabilities via product of parameters are shown to capture multiple forms of contextual independence between variables, including decision graphs and noisy-OR functions. An inference algorithm for multiplicative models is provided and its correctness is proved. The complexity analysis of the inference algorithm uses a more refined parameter than the tree-width of the underlying graph, and shows the computational cost does not exceed that of the variable elimination algorithm in graphical models. The paper ends with examples where using the new models and algorithm is computationally beneficial.},
author = {Wexler, Ydo and Meek, Christopher},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {595--602},
title = {Inference for multiplicative models},
year = {2008}
}
@inproceedings{71,
abstract = {In this paper we introduce Refractor Importance Sampling (RIS), an improvement to reduce error variance in Bayesian network importance sampling propagation under evidential reasoning. We prove the existence of a collection of importance functions that are close to the optimal importance function under evidential reasoning. Based on this theoretic result we derive the RIS algorithm. RIS approaches the optimal importance function by applying localized arc changes to minimize the divergence between the evidence-adjusted importance function and the optimal importance function. The validity and performance of RIS is empirically tested with a large set of synthetic Bayesian networks and two real-world networks.},
author = {Yu, Haohai and van Engelen, Robert A.},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {603--609},
title = {Refractor importance sampling},
year = {2008}
}
@inproceedings{8,
abstract = {It is well-known that inference in graphical models is hard in the worst case, but tractable for models with bounded treewidth. We ask whether treewidth is the only structural criterion of the underlying graph that enables tractable inference. In other words, is there some class of structures with unbounded treewidth in which inference is tractable? Subject to a combinatorial hypothesis due to Robertson et al. (1994), we show that low treewidth is indeed the only structural restriction that can ensure tractability. Thus, even for the "best case" graph structure, there is no inference algorithm with complexity polynomial in the treewidth.},
author = {Chandrasekaran, Venkat and Srebro, Nathan and Harsha, Prahladh},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {70--78},
title = {Complexity of inference in graphical models},
year = {2008}
}
@inproceedings{9,
abstract = {We propose an approach for approximating the partition function which is based on two steps: (1) computing the partition function of a simplified model which is obtained by deleting model edges, and (2) rectifying the result by applying an edge-by-edge correction. The approach leads to an intuitive framework in which one can trade-off the quality of an approximation with the complexity of computing it. It also includes the Bethe free energy approximation as a degenerate case. We develop the approach theoretically in this paper and provide a number of empirical results that reveal its practical utility.},
author = {Choi, Arthur and Darwiche, Adnan},
editor = {McAllester, David A. and Myllym{\"a}ki, Petri},
pages = {79--87},
title = {Approximating the partition function by deleting and then correcting for model edges},
year = {2008}
}