% bayes-rules-notes/R/ch2.tex
%
% 381 lines
% 16 KiB
% TeX
% Raw Permalink Blame History
%
% This file contains ambiguous Unicode characters
%
% This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
%
\documentclass[
letterpaper,
DIV=11,
numbers=noendperiod]{scrartcl}
\usepackage{amsmath,amssymb}
\usepackage{lmodern}
\usepackage{iftex}
\ifPDFTeX
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph separation by vertical space rather than indentation.
% Under a KOMA-Script class (detected via \KOMAClassName) the class's own
% parskip=half option is used; otherwise the parskip package is loaded when
% it is installed, falling back to setting \parindent and \parskip by hand.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\setlength{\emergencystretch}{3em} % prevent overfull lines
\setcounter{secnumdepth}{-\maxdimen} % remove section numbering
% Make \paragraph and \subparagraph free-standing
% Each command is wrapped only if the class defines it: the original is saved
% under an \old... name and the replacement calls it, then appends an empty
% \mbox{} after the heading.
% NOTE(review): presumably the empty box gives the heading some content to
% attach to when a list or display follows directly -- confirm.
\ifx\paragraph\undefined\else
\let\oldparagraph\paragraph
\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
\fi
\ifx\subparagraph\undefined\else
\let\oldsubparagraph\subparagraph
\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
\fi
\usepackage{color}
\usepackage{fancyvrb}
% Verbatim helpers for highlighted code listings (fancyvrb).
% \VerbBar expands to a literal vertical bar.
% \VERB is inline \Verb with backslash, { and } kept as command characters.
% Highlighting is a Verbatim environment with the same command characters, so
% the \...Tok colour macros written inside a listing are executed, not printed.
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
% Shaded code blocks: the Shaded environment wraps its content in snugshade
% (from framed), and shadecolor is defined here as a light grey.
\usepackage{framed}
\definecolor{shadecolor}{RGB}{241,243,245}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
% Syntax-highlighting token macros used inside Highlighting listings.
% There is one macro per token class; each typesets its argument in a fixed
% rgb colour, and a few (CommentVar, Documentation, Warning) also in italics.
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.40,0.45,0.13}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\BuiltInTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\ExtensionTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.28,0.35,0.67}{#1}}
\newcommand{\ImportTok}[1]{\textcolor[rgb]{0.00,0.46,0.62}{#1}}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\NormalTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\RegionMarkerTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.07,0.07,0.07}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
% \tightlist zeroes \itemsep and \parskip; it is only defined here if no
% definition exists yet (\providecommand).
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}%
  \setlength{\parskip}{0pt}%
}
% Table packages.
\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
% \patchcmd (etoolbox) rewrites \longtable so that its \par is preceded by an
% empty \mbox whenever \if@noskipsec is true. The two empty trailing arguments
% mean nothing extra is done whether the patch succeeds or fails.
% NOTE(review): \if@noskipsec is presumably the kernel flag that is set right
% after a run-in heading -- confirm.
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
% \maxwidth expands to the natural width of the current image
% (\Gin@nat@width), capped at \linewidth; \maxheight does the same for the
% natural height, capped at \textheight.
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage{caption}
\usepackage{longtable}
\KOMAoption{captions}{tableheading}
% Callout boxes: load tcolorbox and fontawesome5 unless already loaded, then
% define the callout palette. Every callout kind (default, note, important,
% warning, tip, caution) has a base colour and a matching "-frame" colour;
% the callout in the body uses the base colour for the title icon and title
% background tint, and the "-frame" colour for the box frame.
\makeatletter
\@ifpackageloaded{tcolorbox}{}{\usepackage[many]{tcolorbox}}
\@ifpackageloaded{fontawesome5}{}{\usepackage{fontawesome5}}
\definecolor{quarto-callout-color}{HTML}{909090}
\definecolor{quarto-callout-note-color}{HTML}{0758E5}
\definecolor{quarto-callout-important-color}{HTML}{CC1914}
\definecolor{quarto-callout-warning-color}{HTML}{EB9113}
\definecolor{quarto-callout-tip-color}{HTML}{00A047}
\definecolor{quarto-callout-caution-color}{HTML}{FC5300}
\definecolor{quarto-callout-color-frame}{HTML}{acacac}
\definecolor{quarto-callout-note-color-frame}{HTML}{4582ec}
\definecolor{quarto-callout-important-color-frame}{HTML}{d9534f}
\definecolor{quarto-callout-warning-color-frame}{HTML}{f0ad4e}
\definecolor{quarto-callout-tip-color-frame}{HTML}{02b875}
\definecolor{quarto-callout-caution-color-frame}{HTML}{fd7e14}
\makeatother
\makeatletter
\makeatother
\makeatletter
\makeatother
% Caption names and the code-listing float.
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
% Set the fixed names at \begin{document}. For each name the macro is
% redefined if it already exists and created otherwise, so the same text is
% installed either way.
\AtBeginDocument{%
\ifdefined\contentsname
\renewcommand*\contentsname{Table of contents}
\else
\newcommand\contentsname{Table of contents}
\fi
\ifdefined\listfigurename
\renewcommand*\listfigurename{List of Figures}
\else
\newcommand\listfigurename{List of Figures}
\fi
\ifdefined\listtablename
\renewcommand*\listtablename{List of Tables}
\else
\newcommand\listtablename{List of Tables}
\fi
\ifdefined\figurename
\renewcommand*\figurename{Figure}
\else
\newcommand\figurename{Figure}
\fi
\ifdefined\tablename
\renewcommand*\tablename{Table}
\else
\newcommand\tablename{Table}
\fi
}
% Declare a "codelisting" float in the ruled style (placement h, list file
% extension lop). It is numbered within chapters only when a chapter counter
% exists. Its caption label is "Listing" and \listoflistings prints the
% "List of Listings".
\@ifpackageloaded{float}{}{\usepackage{float}}
\floatstyle{ruled}
\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]}
\floatname{codelisting}{Listing}
\newcommand*\listoflistings{\listof{codelisting}{List of Listings}}
\makeatother
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}}
\makeatother
\makeatletter
\@ifpackageloaded{tcolorbox}{}{\usepackage[many]{tcolorbox}}
\makeatother
\makeatletter
% Fallback definition of the colour "shadecolor".
% \@ifundefined takes three arguments: the name, the code to run when it is
% undefined, and the code to run when it is defined. The empty third argument
% is required: without it the following \makeatother is consumed as the
% "defined" branch and is discarded whenever the name is undefined.
% NOTE(review): the test looks at the control sequence \shadecolor, not at the
% xcolor colour named shadecolor -- confirm that is the intended check.
\@ifundefined{shadecolor}{\definecolor{shadecolor}{rgb}{.97, .97, .97}}{}
\makeatother
\makeatletter
\makeatother
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same} % disable monospaced font for URLs
\hypersetup{
pdftitle={Chapter 2 Notes},
pdfauthor={Emanuel Rodriguez},
colorlinks=true,
linkcolor={blue},
filecolor={Maroon},
citecolor={Blue},
urlcolor={Blue},
pdfcreator={LaTeX via pandoc}}
\title{Chapter 2 Notes}
\author{Emanuel Rodriguez}
\date{}
\begin{document}
\maketitle
% If the Shaded environment exists, re-implement it as a breakable tcolorbox
% with hidden frame and interior, sharp corners, and only a 3pt left border
% drawn in shadecolor.
\ifdefined\Shaded\renewenvironment{Shaded}{\begin{tcolorbox}[interior hidden, enhanced, breakable, frame hidden, sharp corners, boxrule=0pt, borderline west={3pt}{0pt}{shadecolor}]}{\end{tcolorbox}}\fi
In this chapter we step through an example of ``fake'' vs.\ ``real'' news
to build a framework for determining the probability that a new news
article, titled ``The President has a secret!'', is real or fake.
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# libraries}
\FunctionTok{library}\NormalTok{(bayesrules)}
\FunctionTok{library}\NormalTok{(dplyr)}
\FunctionTok{library}\NormalTok{(tidyr)}
\FunctionTok{library}\NormalTok{(gt)}
\FunctionTok{data}\NormalTok{(fake\_news)}
\NormalTok{fake\_news }\OtherTok{\textless{}{-}}\NormalTok{ tibble}\SpecialCharTok{::}\FunctionTok{as\_tibble}\NormalTok{(fake\_news)}
\end{Highlighting}
\end{Shaded}
What proportion of the news articles were labeled fake vs.\ real?
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fake\_news }\SpecialCharTok{|\textgreater{}} \FunctionTok{glimpse}\NormalTok{()}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
Rows: 150
Columns: 30
$ title <chr> "Clinton's Exploited Haiti Earthquake to Stea~
$ text <chr> "0 SHARES Facebook Twitter\n\nBernard Sansaric~
$ url <chr> "http://freedomdaily.com/former-haitian-senate~
$ authors <chr> NA, NA, "Sierra Marlee", "Jack Shafer,Nolan D"~
$ type <fct> fake, real, fake, real, fake, real, fake, fake~
$ title_words <int> 17, 18, 16, 11, 9, 12, 11, 18, 10, 13, 10, 11,~
$ text_words <int> 219, 509, 494, 268, 479, 220, 184, 500, 677, 4~
$ title_char <int> 110, 95, 96, 60, 54, 66, 86, 104, 66, 81, 59, ~
$ text_char <int> 1444, 3016, 2881, 1674, 2813, 1351, 1128, 3112~
$ title_caps <int> 0, 0, 1, 0, 0, 1, 0, 2, 1, 1, 0, 1, 0, 0, 0, 0~
$ text_caps <int> 1, 1, 3, 3, 0, 0, 0, 12, 12, 1, 2, 5, 1, 1, 6,~
$ title_caps_percent <dbl> 0.000000, 0.000000, 6.250000, 0.000000, 0.0000~
$ text_caps_percent <dbl> 0.4566210, 0.1964637, 0.6072874, 1.1194030, 0.~
$ title_excl <int> 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0~
$ text_excl <int> 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0~
$ title_excl_percent <dbl> 0.0000000, 0.0000000, 2.0833333, 0.0000000, 0.~
$ text_excl_percent <dbl> 0.00000000, 0.00000000, 0.06942034, 0.00000000~
$ title_has_excl <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE~
$ anger <dbl> 4.24, 2.28, 1.18, 4.66, 0.82, 1.29, 2.56, 3.47~
$ anticipation <dbl> 2.12, 1.71, 2.16, 1.79, 1.23, 0.43, 2.05, 1.74~
$ disgust <dbl> 2.54, 1.90, 0.98, 1.79, 0.41, 1.72, 2.05, 1.35~
$ fear <dbl> 3.81, 1.90, 1.57, 4.30, 0.82, 0.43, 5.13, 4.25~
$ joy <dbl> 1.27, 1.71, 1.96, 0.36, 1.23, 0.86, 1.54, 1.35~
$ sadness <dbl> 4.66, 1.33, 0.78, 1.79, 0.82, 0.86, 2.05, 1.93~
$ surprise <dbl> 2.12, 1.14, 1.18, 1.79, 0.82, 0.86, 1.03, 1.35~
$ trust <dbl> 2.97, 4.17, 3.73, 2.51, 2.46, 2.16, 5.13, 3.86~
$ negative <dbl> 8.47, 4.74, 3.33, 6.09, 2.66, 3.02, 4.10, 4.63~
$ positive <dbl> 3.81, 4.93, 5.49, 2.15, 4.30, 2.16, 4.10, 4.25~
$ text_syllables <int> 395, 845, 806, 461, 761, 376, 326, 891, 1133, ~
$ text_syllables_per_word <dbl> 1.803653, 1.660118, 1.631579, 1.720149, 1.5887~
\end{verbatim}
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fake\_news }\SpecialCharTok{|\textgreater{}}
\FunctionTok{group\_by}\NormalTok{(type) }\SpecialCharTok{|\textgreater{}}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{n}\NormalTok{(),}
\AttributeTok{prop =}\NormalTok{ total }\SpecialCharTok{/} \FunctionTok{nrow}\NormalTok{(fake\_news)}
\NormalTok{ ) }
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
# A tibble: 2 x 3
type total prop
<fct> <int> <dbl>
1 fake 60 0.4
2 real 90 0.6
\end{verbatim}
If we let \(B\) be the event that a news article is ``fake'' news, and
\(B^c\) be the event that a news article is ``real'', we can write the
following:
\[P(B) = .4\] \[P(B^c) = .6\]
This is the first ``clue'' or set of data that we have to build into our
framework. Namely, the majority of articles are ``real'', therefore we could
simply predict that the new article is ``real''. This updated sense of
reality now becomes our priors.
Next, we gather additional data and update our priors based on it.
The new observation we make is the use of exclamation marks ``!''.
We note that the use of ``!'' is more frequent in news articles labeled
as ``fake''. We will want to incorporate this into our framework to
decide whether the new incoming article should be labelled as real or fake.
\hypertarget{likelihood}{%
\subsubsection{Likelihood}\label{likelihood}}
\begin{tcolorbox}[enhanced jigsaw, coltitle=black, rightrule=.15mm, bottomtitle=1mm, breakable, colframe=quarto-callout-note-color-frame, toprule=.15mm, colback=white, opacityback=0, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Probability and Likelihood}, bottomrule=.15mm, opacitybacktitle=0.6, toptitle=1mm, titlerule=0mm, arc=.35mm, leftrule=.75mm, left=2mm, colbacktitle=quarto-callout-note-color!10!white]
When the event \(B\) is known, then we can evaluate the uncertainty of
events \(A\) and \(A^c\) given \(B\)
\[P(A|B) \text{ vs } P(A^c|B)\]
If, on the other hand, we know event \(A\), then we can evaluate the
relative compatibility of data \(A\) with \(B\) and \(B^c\) using
likelihood functions
\[L(B|A) \text{ vs } L(B^c|A)\] \[=P(A|B) \text{ vs } P(A|B^c)\]
\end{tcolorbox}
So in our case, we don't know whether this new incoming article is real
or not, but we do know that the title has an exclamation mark. This
means we can evaluate how likely this article is real or not given that
it contains an ``!'' in the title using likelihood functions. Letting
\(A\) be the event that an article's title contains an exclamation mark,
we can formulate this as:
\[L(B|A) \text{ vs } L(B^c|A)\]
And perform the computation in R as follows:
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# if fake, what are the proportions of ! vs no{-}!}
\NormalTok{prop\_of\_excl\_within\_type }\OtherTok{\textless{}{-}}\NormalTok{ fake\_news }\SpecialCharTok{|\textgreater{}}
\FunctionTok{group\_by}\NormalTok{(type, title\_has\_excl) }\SpecialCharTok{|\textgreater{}}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{total =} \FunctionTok{n}\NormalTok{()}
\NormalTok{ ) }\SpecialCharTok{|\textgreater{}}
\FunctionTok{ungroup}\NormalTok{() }\SpecialCharTok{|\textgreater{}}
\FunctionTok{group\_by}\NormalTok{(type) }\SpecialCharTok{|\textgreater{}}
\FunctionTok{summarise}\NormalTok{(}
\AttributeTok{has\_excl =}\NormalTok{ title\_has\_excl,}
\AttributeTok{prop\_within\_type =}\NormalTok{ total }\SpecialCharTok{/} \FunctionTok{sum}\NormalTok{(total)}
\NormalTok{ ) }
\end{Highlighting}
\end{Shaded}
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{prop\_of\_excl\_within\_type }\SpecialCharTok{|\textgreater{}}
\FunctionTok{pivot\_wider}\NormalTok{(}\AttributeTok{names\_from =} \StringTok{"type"}\NormalTok{, }\AttributeTok{values\_from =}\NormalTok{ prop\_within\_type) }\SpecialCharTok{|\textgreater{}}
\FunctionTok{gt}\NormalTok{() }\SpecialCharTok{|\textgreater{}}
\NormalTok{ gt}\SpecialCharTok{::}\FunctionTok{cols\_label}\NormalTok{(}
\AttributeTok{has\_excl =} \StringTok{"Contains Exclamation"}\NormalTok{,}
\AttributeTok{fake =} \StringTok{"Fake"}\NormalTok{, }
\AttributeTok{real =} \StringTok{"Real"}\NormalTok{) }\SpecialCharTok{|\textgreater{}}
\NormalTok{ gt}\SpecialCharTok{::}\FunctionTok{fmt\_number}\NormalTok{(}\AttributeTok{columns=}\FunctionTok{c}\NormalTok{(}\StringTok{"fake"}\NormalTok{, }\StringTok{"real"}\NormalTok{), }\AttributeTok{decimals =} \DecValTok{3}\NormalTok{) }\SpecialCharTok{|\textgreater{}}
\NormalTok{ gt}\SpecialCharTok{::}\FunctionTok{cols\_width}\NormalTok{(}\FunctionTok{everything}\NormalTok{() }\SpecialCharTok{\textasciitilde{}} \FunctionTok{px}\NormalTok{(}\DecValTok{100}\NormalTok{))}
\end{Highlighting}
\end{Shaded}
\begin{longtable}{crr}
\toprule
Contains Exclamation & Fake & Real \\
\midrule
FALSE & $0.733$ & $0.978$ \\
TRUE & $0.267$ & $0.022$ \\
\bottomrule
\end{longtable}
The table above also shows the likelihoods for the case when an article
does not contain an exclamation point in the title.
\end{document}